TDT4310-project-sorted-japa.../main.py

191 lines
5.9 KiB
Python

from argparse import ArgumentParser, Namespace
from pathlib import Path
from itertools import chain
import os
from src.frontend.flaskapp import create_app
from src.database import connect_to_database
from src.data_ingestion import (
ingest_jmdict,
ingest_nhk_easy_news_articles,
ingest_tatoeba_sentences
)
from src.models import (
Base,
jmdict_tables,
nhk_tables,
tatoeba_tables,
)
from src.models import (
TatoebaSentencePairUsesJMDictEntry,
NHKEasyNewsArticleUsesJMDictEntry,
)
from src.processing import *
arg_parser = ArgumentParser()


def dir_path(string: str) -> Path:
    """Argparse type-converter: return *string* as a Path if it names an existing directory.

    Raises:
        NotADirectoryError: if the path is not an existing directory.
            (The original *returned* the exception object instead of raising
            it, so invalid paths silently passed argument validation.)
    """
    path = Path(string)
    if path.is_dir():
        return path
    raise NotADirectoryError(string)


arg_parser.add_argument('--reset-all', '-r', action='store_true', help='Reset and recalculate all data')
arg_parser.add_argument('--reset-jmdict', '-rj', action='store_true', help='Reingest data from jmdict')
arg_parser.add_argument('--reset-nhk-articles', '-rna', action='store_true', help='Reingest data from nhk')
arg_parser.add_argument('--reset-tatoeba', '-rt', action='store_true', help='Reingest data from tatoeba sentences')
arg_parser.add_argument('--reset-sentence-matches', '-rs', action='store_true', help='Recalculate relations between sentences and jmdict entries')
# Help text below previously duplicated the sentence-match description; this
# flag actually triggers word-frequency recalculation for NHK articles.
arg_parser.add_argument('--reset-nhk-statistics', '-rns', action='store_true', help='Recalculate word frequency statistics of nhk easy news articles')
arg_parser.add_argument('--reset-difficulty-values', '-rd', action='store_true', help='Recalculate difficulty of jmdict entries and sentences')
# TODO: add --redownload-{jmdict,nhk,tatoeba} flags once downloading is implemented
arg_parser.add_argument('--echo-sql', '-e', action='store_true', help='Echo SQL statements')
arg_parser.add_argument('--data-dir', '-d', type=dir_path, default=Path('./data'), help='Directory where data is stored')
arg_parser.add_argument('--use-memory-db', '-m', action='store_true', help='Use an in-memory database for debugging purposes')
def _reingest_specified_tables(args: Namespace) -> None:
    """Drop, recreate, and reingest the tables selected by the reset flags.

    Only the table groups whose corresponding --reset-* flag is set are
    touched; everything else in the database is left alone.
    """
    selected_models = []
    if args.reset_jmdict:
        selected_models.extend(jmdict_tables)
    if args.reset_nhk_articles:
        selected_models.extend(nhk_tables)
    if args.reset_tatoeba:
        selected_models.extend(tatoeba_tables)
    sql_tables = [model.__table__ for model in selected_models]
    session_maker = connect_to_database(
        args.data_dir,
        args.echo_sql,
        args.use_memory_db,
    )
    with session_maker() as session:
        bind = session.get_bind()
        # Recreate the selected tables from scratch before reingesting.
        Base.metadata.drop_all(bind, sql_tables)
        Base.metadata.create_all(bind, sql_tables)
        if args.reset_jmdict:
            ingest_jmdict(session, args.data_dir)
        if args.reset_nhk_articles:
            ingest_nhk_easy_news_articles(session, args.data_dir)
        if args.reset_tatoeba:
            ingest_tatoeba_sentences(session, args.data_dir)
def _reprocess_sentence_entry_matches(args: Namespace) -> None:
    """Recompute which JMDict entries are used by NHK articles and Tatoeba pairs.

    Each association table is rebuilt when its own reset flag is set, or when
    either side of the relation (jmdict, or the respective corpus) was reset.
    """
    session_maker = connect_to_database(
        args.data_dir,
        args.echo_sql,
        args.use_memory_db,
    )
    with session_maker() as session:
        bind = session.get_bind()

        rebuild_nhk_matches = (
            args.reset_jmdict
            or args.reset_nhk_articles
            or args.reset_sentence_matches
        )
        if rebuild_nhk_matches:
            nhk_table = NHKEasyNewsArticleUsesJMDictEntry.__table__
            Base.metadata.drop_all(bind, [nhk_table])
            Base.metadata.create_all(bind, [nhk_table])
            connect_nhk_easy_news_articles_to_jmdict_entries(session)

        rebuild_tatoeba_matches = (
            args.reset_jmdict
            or args.reset_tatoeba
            or args.reset_sentence_matches
        )
        if rebuild_tatoeba_matches:
            tatoeba_table = TatoebaSentencePairUsesJMDictEntry.__table__
            Base.metadata.drop_all(bind, [tatoeba_table])
            Base.metadata.create_all(bind, [tatoeba_table])
            connect_tatoeba_sentences_to_jmdict_entries(session)
def _recalculate_nhk_statistics(args: Namespace) -> None:
    """Recompute word-frequency statistics over the NHK Easy News articles."""
    make_session = connect_to_database(
        args.data_dir,
        args.echo_sql,
        args.use_memory_db,
    )
    with make_session() as session:
        calculate_word_frequency_of_nhk_easy_news_articles(session)
def _recalculate_difficulty_values(args: Namespace) -> None:
    """Recompute difficulty values for all JMDict entries and sentences."""
    make_session = connect_to_database(
        args.data_dir,
        args.echo_sql,
        args.use_memory_db,
    )
    with make_session() as session:
        calculate_difficulty_values_of_all_words_and_sentences(session)
if __name__ == "__main__":
    args = arg_parser.parse_args()

    # --reset-all implies every individual reset flag.
    if args.reset_all:
        args.reset_jmdict = True
        args.reset_nhk_articles = True
        args.reset_tatoeba = True
        args.reset_sentence_matches = True
        args.reset_nhk_statistics = True
        args.reset_difficulty_values = True

    # TODO: support redownloading jmdict, nhk easy news, and the tanaka corpus

    changed_database = False

    # Each pipeline stage runs when its own flag is set OR when an upstream
    # stage it depends on was reset (e.g. fresh jmdict data invalidates the
    # sentence matches, which in turn invalidate the difficulty values).
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
    ]):
        changed_database = True
        _reingest_specified_tables(args)
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
    ]):
        changed_database = True
        _reprocess_sentence_entry_matches(args)
    if any([
        args.reset_nhk_articles,
        args.reset_nhk_statistics,
    ]):
        changed_database = True
        _recalculate_nhk_statistics(args)
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
        args.reset_nhk_statistics,
        args.reset_difficulty_values,
    ]):
        changed_database = True
        _recalculate_difficulty_values(args)

    if changed_database:
        print('Recreated the database with newly processed data!')
        # Fixed: the previous message referenced a nonexistent --reset-db flag.
        print('You can now run the app without any --reset flags to start the webserver.')
    else:
        app = create_app(args)
        app.run(debug=True)