Spaces:

kyauy
/

ClinFly

Sleeping

File size: 8,417 Bytes

import csv
import os
import argparse
import pandas as pd
from utilities.anonymize import (
    get_cities_list,
    get_abbreviation_dict_correction,
    reformat_to_report,
    anonymize_analyzer,
    anonymize_engine,
    add_space_to_comma_endpoint,
    get_list_not_deidentify,
    config_deidentify,
)
from utilities.translate import get_translation_dict_correction, translate_report
from utilities.convert import (
    convert_df_no_header,
    convert_df,
    convert_json,
    convert_list_phenogenius,
    convert_pdf_to_text,
)
from utilities.extract_hpo import add_biometrics, extract_hpo
from utilities.get_model import get_models, get_nlp_marian
import gc


def _write_utf8(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    # The payloads written here are decoded from UTF-8 bytes, so the file is
    # encoded the same way instead of relying on the platform's locale default.
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(text)


def main():
    """Translate, de-identify and summarise the report currently being processed.

    The function takes no arguments: it reads the module-level names bound by
    the ``__main__`` section (``Report``, ``Report_id``, ``Last_name``,
    ``First_name``, ``args``, ``nlp_fr``, ``marian_fr_en``, ``dict_correction``,
    ``dict_abbreviation_correction``, ``analyzer``, ``engine`` and
    ``proper_noun``).

    Four files are written below ``args.result_dir``, each named
    ``<Report_id>_<Last_name>_<First_name><suffix>``:

    * ``Reports/..._translated_and_deindentified_report.txt``
    * ``TSV/..._summarized_report.tsv`` (all extracted phenotypes)
    * ``JSON/..._summarized_report.json`` (rows flagged "To keep in list")
    * ``TXT/..._summarized_report.txt`` (rows flagged "To keep in list")
    """
    print("Code Starting")
    translated_text, _, _ = translate_report(
        Report,
        Last_name,
        First_name,
        nlp_fr,
        marian_fr_en,
        dict_correction,
        dict_abbreviation_correction,
    )
    translated_report = reformat_to_report(translated_text, nlp_fr)
    del translated_text

    print("Translation and De-identification")
    analyzed_report, analyzer_results, _, _ = anonymize_analyzer(
        translated_report, analyzer, proper_noun, Last_name, First_name
    )
    print(analyzed_report)

    deidentified_report = anonymize_engine(
        translated_report, analyzer_results, engine, nlp_fr
    )

    # Every output file shares this "<id>_<last>_<first>" stem.
    file_stem = f"{Report_id}_{Last_name}_{First_name}"

    # One DataFrame row per line of the de-identified report.
    report_lines_df = pd.DataFrame(deidentified_report.split("\n"))
    report_file_name = file_stem + "_translated_and_deindentified_report.txt"
    # Convert before opening the file so a conversion failure does not leave
    # an empty output file behind.
    report_text = convert_df_no_header(report_lines_df).decode("utf-8")
    _write_utf8(
        os.path.join(args.result_dir, "Reports", "") + report_file_name,
        report_text,
    )
    print("Text file created successfully : " + report_file_name)

    print("Summarization")

    spaced_report = add_space_to_comma_endpoint(deidentified_report, nlp_fr)
    biometrics_report, _ = add_biometrics(spaced_report, nlp_fr)
    clinphen, clinphen_unsafe = extract_hpo(biometrics_report)

    # Drop the intermediate texts as soon as they are no longer needed.
    del deidentified_report
    del spaced_report
    del biometrics_report

    # The second table returned by extract_hpo is tagged as low confidence and
    # excluded from the kept list; the first one is tagged as high confidence.
    clinphen_unsafe["To keep in list"] = False
    clinphen_unsafe["Confidence on extraction"] = "low"

    clinphen["Confidence on extraction"] = "high"
    clinphen["To keep in list"] = True

    cols = [
        "HPO ID",
        "Phenotype name",
        "To keep in list",
        "No. occurrences",
        "Earliness (lower = earlier)",
        "Confidence on extraction",
        "Example sentence",
    ]
    clinphen_all = pd.concat([clinphen, clinphen_unsafe]).reset_index()[cols]

    # Element-wise comparison: builds a boolean mask over the rows to keep.
    clinphen_kept = clinphen_all[clinphen_all["To keep in list"] == True]  # noqa: E712

    del clinphen
    del clinphen_unsafe
    gc.collect()

    tsv_path = (
        os.path.join(args.result_dir, "TSV", "") + file_stem + "_summarized_report.tsv"
    )
    _write_utf8(tsv_path, convert_df(clinphen_all).decode("utf-8"))
    print("Tsv file created successfully : " + tsv_path)

    json_path = (
        os.path.join(args.result_dir, "JSON", "")
        + file_stem
        + "_summarized_report.json"
    )
    _write_utf8(json_path, convert_json(clinphen_kept))
    print("JSON file created successfully : " + json_path)

    txt_path = (
        os.path.join(args.result_dir, "TXT", "") + file_stem + "_summarized_report.txt"
    )
    _write_utf8(txt_path, convert_list_phenogenius(clinphen_kept))
    print("Text file created successfully : " + txt_path)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments, prepare the output
    # directories, load the models once, then run main() for every line of the
    # tab-separated input file.

    print("Welcome to the Clinfly app")

    parser = argparse.ArgumentParser(description="Description of clinfly arguments")
    parser.add_argument(
        "--file",
        type=str,
        help="the input file which contains the visits informations",
        required=True,
    )
    parser.add_argument(
        "--language",
        choices=["fr", "es", "de"],
        type=str,
        help="The language of the input : fr, es , de",
        required=True,
    )
    parser.add_argument(
        "--model_dir",
        default=os.path.expanduser("~"),
        type=str,
        help="The directory where the models will be downloaded.",
    )
    parser.add_argument(
        "--result_dir",
        default="Results",
        type=str,
        help="The directory where the results will be placed.",
    )

    args = parser.parse_args()

    # exist_ok=True replaces the check-then-create pairs and cannot race with
    # another process creating the same directory.
    os.makedirs(args.model_dir, exist_ok=True)
    os.makedirs(args.result_dir, exist_ok=True)
    for result_subdir in ("Reports", "TSV", "JSON", "TXT"):
        os.makedirs(os.path.join(args.result_dir, result_subdir), exist_ok=True)

    print("Language chosen :", args.language)
    models_status = get_models(args.language, args.model_dir)
    dict_correction = get_translation_dict_correction()
    dict_abbreviation_correction = get_abbreviation_dict_correction()
    proper_noun = get_list_not_deidentify()
    cities_list = get_cities_list()
    analyzer, engine = config_deidentify(cities_list)
    nlp_fr, marian_fr_en = get_nlp_marian(args.language)

    file_name = args.file
    # Per-report values read by main() as module-level globals.
    Report_id: str
    Last_name: str
    First_name: str
    Report: str

    if os.path.isfile(args.file):
        with open(file_name, "r") as fichier:
            for line_number, ligne in enumerate(fichier, start=1):
                ligne = ligne.strip()
                if not ligne:
                    # Blank lines carry no report; skip them.
                    continue

                # Expected layout: Report_id <TAB> Last_name <TAB> First_name
                # <TAB> text-or-path. maxsplit=3 keeps any tab inside the
                # report text in the last field.
                elements = ligne.split("\t", 3)
                if len(elements) != 4:
                    print(
                        f"Skipping line {line_number}: expected 4 tab-separated "
                        f"fields, found {len(elements)}."
                    )
                    continue

                Report_id, Last_name, First_name, text_or_link = elements
                print("Report_id:", Report_id)
                print("Last_name:", Last_name)
                print("First_name:", First_name)
                if os.path.exists(text_or_link):
                    if text_or_link.lower().endswith(".pdf"):
                        print(f"Processing PDF file: {text_or_link}")
                        Report = convert_pdf_to_text(text_or_link)
                    else:
                        print(
                            "Unsupported file type. Please provide a link to a PDF files."
                        )
                        # No report text is available for this line. Running
                        # main() would either fail on an undefined Report or
                        # reuse the previous patient's report under this
                        # patient's name, so skip the line.
                        continue
                else:
                    # Not an existing path: the field is the report text itself.
                    Report = text_or_link
                    print("Report:", Report)
                main()
                print()
    else:
        print("Input is not a file. Please provide a valid input.")