First of all, we need to extract all the data packages from GitHub. We're going to look at every repository that has a datapackage.json
file in its root directory. The GitHub Search API has quite strict rate limits, so we pause between requests and retry after a cooldown to avoid rate limit errors:
$ python code/extract.py
import os
import json
import time
from dotenv import load_dotenv
from github import Github, RateLimitExceededException
from frictionless import Package, Resource, FrictionlessException

load_dotenv()

PAUSE = 1
RETRY = 10
QUERY = "resources filename:datapackage.json path:/"

github = Github(os.environ["GITHUB_TOKEN"], per_page=100)


# Helpers


def search_items():
    items = []
    results = github.search_code(QUERY)
    time.sleep(PAUSE)
    page_number = 0
    while True:
        # Pause between requests and retry after a cooldown on rate limit errors
        try:
            page = results.get_page(page_number)
        except RateLimitExceededException:
            time.sleep(RETRY)
            continue
        time.sleep(PAUSE)
        page_number += 1
        if not page:
            break
        for result in page:
            repo = result.repository
            item = {}
            item["code"] = "-".join([repo.owner.login, repo.name])
            item["user"] = repo.owner.login
            item["repo"] = repo.name
            item["branch"] = repo.default_branch
            item["path"] = result.path
            item["stars"] = repo.stargazers_count
            item["download_url"] = result.download_url
            # Skip repos whose datapackage.json is not valid JSON or not a valid package
            try:
                package = Package(json.loads(result.decoded_content))
                item["title"] = package.title
                item["description"] = package.description_text
                item["content"] = json.dumps(package.to_dict())
            except (json.JSONDecodeError, FrictionlessException):
                continue
            items.append(item)
    print(f"Found items: {len(items)}")
    return items


# General

resource = Resource(search_items())
resource.write("data/packages.raw.csv")
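Before moving on, it can be worth sanity-checking the raw extract. Here is a minimal sketch, assuming the script above has already produced data/packages.raw.csv, that prints the header and a few rows:

from itertools import islice
from frictionless import Resource

# Open the raw CSV and preview the first rows to verify the extraction
with Resource("data/packages.raw.csv") as resource:
    print(resource.header)  # code, user, repo, branch, path, stars, ...
    for row in islice(resource.row_stream, 3):
        print(row["code"], row["stars"])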
As a high-level data processing framework, we will use Frictionless Transform. It sorts the packages by their repository's stargazer count and saves the result to a CSV file:
$ python code/transform.py
from frictionless import Resource, transform, steps

# General

transform(
    Resource("data/packages.raw.csv"),
    steps=[
        steps.table_normalize(),
        steps.row_sort(field_names=["stars"], reverse=True),
        steps.table_write(path="data/packages.csv"),
    ],
)
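If you want to see what these steps do before running them on the real data, the same pipeline works on inline rows. A toy sketch with made-up repositories:

from frictionless import Resource, transform, steps

# Made-up rows to illustrate the descending sort on "stars"
source = Resource([
    {"code": "a-repo", "stars": 5},
    {"code": "b-repo", "stars": 42},
    {"code": "c-repo", "stars": 17},
])

target = transform(
    source,
    steps=[
        steps.table_normalize(),
        steps.row_sort(field_names=["stars"], reverse=True),
    ],
)

print(target.read_rows())  # b-repo (42) comes first, then c-repo, then a-repo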
Once the packages.csv
file is filled with data packages, we need to load them as Livemark Cards. For this task we will use the built-in methods that come with the Cards plugin:
$ python code/load.py
from frictionless import Resource, Package
from livemark.plugins.cards import CardsPlugin

# General

CardsPlugin.delete_cards()
with Resource("data/packages.csv") as resource:
    for row in resource:
        code = row["code"]
        package = Package(row["content"])
        CardsPlugin.create_card("cards/package.md", code=code, package=package)
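Note that row["content"] holds the JSON descriptor serialized during the extract step. If your frictionless version treats a plain string as a file path rather than an inline descriptor, parse it explicitly first; a defensive variant of the same loop:

import json
from frictionless import Package, Resource
from livemark.plugins.cards import CardsPlugin

# Same loop as above, but decoding the serialized descriptor into a dict
# before handing it to Package, mirroring how extract.py parsed it
CardsPlugin.delete_cards()
with Resource("data/packages.csv") as resource:
    for row in resource:
        package = Package(json.loads(row["content"]))
        CardsPlugin.create_card("cards/package.md", code=row["code"], package=package)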