tsuki/nhk-scraper: WIP changes

main
Oystein Kristoffer Tveit 2024-01-23 05:51:37 +01:00
parent 9f2e7f7ac1
commit dd800a3794
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146
2 changed files with 92 additions and 22 deletions

View File

@ -6,7 +6,7 @@
in { in {
systemd.services.scrape-nhk-easy-news = { systemd.services.scrape-nhk-easy-news = {
after = [ "network.target" ]; after = [ "network.target" ];
serviceConfig = { serviceConfig = rec {
Type = "oneshot"; Type = "oneshot";
ExecStart = script; ExecStart = script;
DynamicUser = true; DynamicUser = true;
@ -15,6 +15,10 @@ in {
ProtectProc = "invisible"; ProtectProc = "invisible";
ProtectSystem = "strict"; ProtectSystem = "strict";
WorkingDirectory = "/data/scrapers/nhk-easy-news"; WorkingDirectory = "/data/scrapers/nhk-easy-news";
BindPaths = [ WorkingDirectory ];
ReadWritePaths = [ WorkingDirectory ];
StateDirectory = "nhk-easy-news-scraper";
StateDirectoryMode = "0755";
}; };
}; };

View File

@ -1,38 +1,104 @@
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse
from textwrap import dedent, indent
import os import os
import json
import requests import requests
import wget import wget
# Root URL of NHK News Web Easy; the article index is fetched from
# BASE_URL + '/news-list.json'.
BASE_URL = "https://www3.nhk.or.jp/news/easy"
def url_filename(url) -> str:
    """Return the last path component (the file name) of *url*.

    Query strings and fragments are ignored; only the URL path is used.
    """
    return Path(urlparse(url).path).name
def try_download(url, path) -> Exception | None:
    """Download *url* to *path*, returning the exception on failure.

    On success returns None.  On any error the partially written file
    (if one was created) is removed, so a later run will retry the
    download instead of keeping a truncated file.

    Note: the original annotation said ``str | None``, but the caught
    exception object itself is returned, hence ``Exception | None``.
    """
    try:
        wget.download(url, out=str(path))
    except Exception as err:  # broad on purpose: best-effort, caller logs it
        if path.exists():
            os.remove(path)
        return err
    return None
def download_article(article, dir_path) -> None:
    """Download one article (HTML, metadata, media) into *dir_path*.

    Creates ``dir_path/<news_id>/`` and fills it with ``index.html``,
    ``info.json`` and any enabled supplementary files.  Each step is
    skipped when its output already exists, so reruns only fetch what
    is missing.  Failed downloads are logged and leave no partial file
    behind (see try_download), so they are retried on the next run.
    """
    news_id = article['news_id']
    article_path = dir_path / news_id

    if not article_path.exists():
        print(f"New article with ID: {news_id}")
        os.mkdir(article_path)

    index_path = article_path / 'index.html'
    if not index_path.exists():
        print(" Downloading article")
        err = try_download(article['news_web_url'], index_path)
        if err is not None:
            print(dedent(f'''
                Failed to download article {news_id}:
                {article['news_web_url']}
                {indent(str(err), ' ')}
            '''))
            # Without index.html the rest is pointless; retry next run.
            return

    info_path = article_path / 'info.json'
    if not info_path.exists():
        print(" Exporting metadata")
        # ensure_ascii=False keeps the Japanese metadata human-readable;
        # explicit utf-8 avoids depending on the platform default encoding.
        with open(info_path, 'w', encoding='utf-8') as file:
            json.dump(article, file, ensure_ascii=False)

    # (toggle flag in the index, attribute holding the media URL)
    for toggle, url_attr in (
        ('has_news_web_image', 'news_web_image_uri'),
        # ('has_news_web_movie', 'news_web_movie_uri'),
        # ('has_news_easy_image', 'news_easy_image_uri'),
        # ('has_news_easy_movie', 'news_easy_movie_uri'),
        # ('has_news_easy_voice', 'news_easy_voice_uri'),
    ):
        if not article[toggle]:
            continue
        url = article[url_attr]
        # NOTE(review): some media URIs may be relative — confirm before
        # enabling the commented-out kinds above.
        # if not url.startswith('http'):
        #     url = BASE_URL + '/' + url
        path = article_path / url_filename(url)
        if path.exists():
            continue
        print(f' Downloading supplementary material: {url_filename(url)}')
        err = try_download(url, path)
        if err is not None:
            print(dedent(f'''
                Failed to download supplementary material for article {news_id}:
                {url}
                {indent(str(err), ' ')}
            '''))
def main() -> None:
    """Fetch the NHK Easy News article index and download new articles.

    Articles are stored grouped by publication date under
    ``./articles/<date>/`` relative to the current working directory
    (the systemd unit sets WorkingDirectory accordingly).
    """
    print("Starting nhk easy news scraper")
    print()

    print("Fetching article index...")
    # timeout so a stalled connection cannot hang the scraper forever;
    # raise_for_status turns an HTTP error page into a clear exception
    # instead of a confusing JSON decode failure.
    response = requests.get(BASE_URL + '/news-list.json', timeout=60)
    response.raise_for_status()
    nhkjson = response.json()
    print('Got article index')

    base_dir = Path(".").resolve()
    articles_dir = base_dir / 'articles'
    if not articles_dir.exists():
        os.mkdir(articles_dir)

    # The index's first element maps date -> list of article records.
    for date, articlelist in nhkjson[0].items():
        date_dir = articles_dir / date
        if not date_dir.exists():
            print(f"Found new articles for {date}")
            os.mkdir(date_dir)
        for article in articlelist:
            download_article(article, date_dir)
if __name__ == '__main__': if __name__ == '__main__':