Spaces:

hoangkha1810
/

Agentic-Document-Extractor

Build error

File size: 2,760 Bytes

e15d7ea

import os
import shutil
import PyPDF2
import spacy
import requests
from typing import Dict, List

# Load the spaCy `en_core_web_sm` pipeline once at import time; extract_key_info() reuses it.
nlp = spacy.load("en_core_web_sm")

def download_pdf_from_url(url: str, temp_dir: str = "temp") -> str:
    """Download the file at *url* into *temp_dir* and return its local path.

    The local file name is the last segment of the URL *path* (query string
    and fragment are ignored). ``.pdf`` is appended when the name does not
    already end with it, and ``download.pdf`` is used when the URL path has
    no final segment.

    Args:
        url: Address of the file to download.
        temp_dir: Directory that receives the download; created if missing.

    Returns:
        Path of the downloaded file inside *temp_dir*.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    from urllib.parse import urlparse

    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(temp_dir, exist_ok=True)

    # Only the path component is used so that "?token=..." or "#page=2" never
    # leak into the file name.
    file_name = os.path.basename(urlparse(url).path)
    if not file_name:
        file_name = "download"
    if not file_name.lower().endswith(".pdf"):
        file_name += ".pdf"
    file_path = os.path.join(temp_dir, file_name)

    # The timeout keeps a stalled server from blocking forever; the context
    # manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            raise Exception("Không thể tải file từ URL")
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return file_path

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the result is always a ``str``.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)

def extract_key_info(text: str) -> Dict:
    """Extract patient details and a diagnosis line from *text*.

    Named entities come from the module-level spaCy pipeline ``nlp``. Every
    ``PERSON`` / ``DATE`` entity overwrites the previous one, so the last
    entity of each kind found in the document is the one reported.

    The diagnosis is the remainder of the line that follows the first
    (case-sensitive) occurrence of ``"DIAGNOSIS"``. Leading separator
    characters such as ``:`` or ``-`` are removed, and nothing is recorded
    when that remainder is empty.

    Args:
        text: Plain text of the document.

    Returns:
        ``{"patient_info": {...}, "diagnosis": [...]}`` where ``patient_info``
        may hold the keys ``"Patient"`` and ``"Date"`` and ``diagnosis`` holds
        at most one string.
    """
    doc = nlp(text)
    patient_info = {}
    diagnosis = []

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            patient_info["Patient"] = ent.text
        elif ent.label_ == "DATE":
            patient_info["Date"] = ent.text

    _, marker, after_marker = text.partition("DIAGNOSIS")
    if marker:
        rest_of_line = after_marker.split("\n", 1)[0]
        # "DIAGNOSIS: Flu" must yield "Flu", not ": Flu".
        diag_text = rest_of_line.strip().lstrip(":-").strip()
        # A bare header line would otherwise add an empty entry.
        if diag_text:
            diagnosis.append(diag_text)

    return {"patient_info": patient_info, "diagnosis": diagnosis}

def to_markdown(data: Dict) -> str:
    """Render extracted *data* as a two-section Markdown document.

    Each ``data["patient_info"]`` item becomes a ``- key: value`` bullet under
    ``# Patient Information``; each ``data["diagnosis"]`` entry becomes a
    bullet under ``# Diagnosis``. Every line ends with a newline.
    """
    parts = ["# Patient Information\n"]
    parts.extend(
        f"- {field}: {content}\n" for field, content in data["patient_info"].items()
    )
    parts.append("# Diagnosis\n")
    parts.extend(f"- {entry}\n" for entry in data["diagnosis"])
    return "".join(parts)

def to_json(data: Dict) -> str:
    """Serialize *data* to a JSON string indented by two spaces."""
    # Imported locally; the module does not import json at the top level.
    import json

    serialized = json.dumps(data, indent=2)
    return serialized

def process_uploaded_file(file_path: str, upload_dir: str = "file-upload", output_dir: str = "data-extractor") -> Dict:
    """Copy a PDF into *upload_dir*, extract its key data and save the results.

    The extracted data is written to ``<output_dir>/<base_name>.md`` and
    ``<output_dir>/<base_name>.json``, where ``base_name`` is the file name of
    *file_path* without its extension.

    Args:
        file_path: Path of the PDF to process.
        upload_dir: Directory that receives a copy of the PDF; created if missing.
        output_dir: Directory for the Markdown and JSON output; created if missing.

    Returns:
        The dictionary produced by ``extract_key_info``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(upload_dir, exist_ok=True)
    try:
        # shutil.copy returns the path of the newly created file.
        uploaded_file_path = shutil.copy(file_path, upload_dir)
    except shutil.SameFileError:
        # The file already lives in upload_dir; there is nothing to copy.
        uploaded_file_path = file_path

    text = extract_text_from_pdf(uploaded_file_path)
    data = extract_key_info(text)

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    with open(os.path.join(output_dir, f"{base_name}.md"), "w", encoding="utf-8") as md_file:
        md_file.write(to_markdown(data))
    with open(os.path.join(output_dir, f"{base_name}.json"), "w", encoding="utf-8") as json_file:
        json_file.write(to_json(data))

    return data