"""Extract key patient information from PDF reports.

The module can download a PDF from a URL, pull its text out with PyPDF2,
run the spaCy ``en_core_web_sm`` pipeline over that text to pick up PERSON
and DATE entities, locate a ``DIAGNOSIS`` line, and write the result out as
Markdown and JSON files.
"""

import json
import os
import shutil
from typing import Dict, List
from urllib.parse import urlsplit

import PyPDF2
import requests
import spacy

# Load the spaCy model once at import time; extract_key_info reuses it.
nlp = spacy.load("en_core_web_sm")


def download_pdf_from_url(
    url: str, temp_dir: str = "temp", timeout: float = 30.0
) -> str:
    """Download the file at *url* into *temp_dir* and return its local path.

    Args:
        url: Address of the file to download.
        temp_dir: Directory that receives the file; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the downloaded file inside *temp_dir*. The name always ends
        in ``.pdf``.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.RequestException: On connection problems or timeout.
    """
    os.makedirs(temp_dir, exist_ok=True)

    # Use only the path component so a query string or fragment
    # ("report.pdf?token=...") never ends up in the file name.
    file_name = os.path.basename(urlsplit(url).path) or "download"
    if not file_name.lower().endswith(".pdf"):
        file_name += ".pdf"
    file_path = os.path.join(temp_dir, file_name)

    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception("Không thể tải file từ URL")
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return file_path


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages for which PyPDF2 yields no text contribute an empty string.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must finish while the underlying file is still open.
        return "".join(page.extract_text() or "" for page in reader.pages)


def extract_key_info(text: str) -> Dict:
    """Pull patient name, date and diagnosis out of report *text*.

    Args:
        text: Plain text of the report.

    Returns:
        A dict with two keys:

        * ``"patient_info"``: dict that may hold ``"Patient"`` (text of a
          PERSON entity) and ``"Date"`` (text of a DATE entity). When several
          entities share a label, the last one found wins.
        * ``"diagnosis"``: list with the remainder of the line that follows
          the first ``DIAGNOSIS`` marker, or an empty list when the marker is
          absent or nothing follows it on that line.
    """
    doc = nlp(text)
    patient_info = {}
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            patient_info["Patient"] = ent.text
        elif ent.label_ == "DATE":
            patient_info["Date"] = ent.text

    diagnosis = []
    _, marker, remainder = text.partition("DIAGNOSIS")
    if marker:
        # Drop the separator of a "DIAGNOSIS: ..." heading so only the
        # diagnosis itself is kept.
        diag_text = remainder.split("\n", 1)[0].lstrip(": \t").strip()
        if diag_text:
            diagnosis.append(diag_text)

    return {"patient_info": patient_info, "diagnosis": diagnosis}


def to_markdown(data: Dict) -> str:
    """Render the dict produced by ``extract_key_info`` as Markdown.

    The output has a ``# Patient Information`` section with one bullet per
    ``patient_info`` item, followed by a ``# Diagnosis`` section with one
    bullet per diagnosis. Every line ends with a newline.
    """
    lines = ["# Patient Information"]
    lines.extend(
        f"- {key}: {value}" for key, value in data["patient_info"].items()
    )
    lines.append("# Diagnosis")
    lines.extend(f"- {diag}" for diag in data["diagnosis"])
    return "\n".join(lines) + "\n"


def to_json(data: Dict) -> str:
    """Serialize *data* as JSON indented by two spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes, which keeps names readable in the UTF-8 output file.
    """
    return json.dumps(data, indent=2, ensure_ascii=False)


def process_uploaded_file(
    file_path: str,
    upload_dir: str = "file-upload",
    output_dir: str = "data-extractor",
) -> Dict:
    """Copy a PDF into *upload_dir*, extract its key info and export it.

    Args:
        file_path: Path of the PDF to process.
        upload_dir: Directory that receives a copy of the PDF; created if
            missing.
        output_dir: Directory that receives ``<name>.md`` and
            ``<name>.json``, where ``<name>`` is the PDF's base name without
            extension; created if missing.

    Returns:
        The dict produced by ``extract_key_info`` for the PDF's text.
    """
    os.makedirs(upload_dir, exist_ok=True)
    uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))
    try:
        shutil.copy(file_path, uploaded_file_path)
    except shutil.SameFileError:
        # The file already lives in upload_dir; nothing to copy.
        pass

    text = extract_text_from_pdf(uploaded_file_path)
    data = extract_key_info(text)

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    md_path = os.path.join(output_dir, f"{base_name}.md")
    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(to_markdown(data))

    json_path = os.path.join(output_dir, f"{base_name}.json")
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(to_json(data))

    return data