Spaces:
Build error
Build error
| import os | |
| import shutil | |
| import PyPDF2 | |
| import spacy | |
| import requests | |
| from typing import Dict, List | |
| # Load the spaCy model once at import time; extract_key_info() runs its input text through it. | |
| nlp = spacy.load("en_core_web_sm") | |
def download_pdf_from_url(
    url: str,
    temp_dir: str = "temp",
    timeout: float = 30.0,
) -> str:
    """Download the file at ``url`` into ``temp_dir`` and return its local path.

    The local file name is the last segment of the URL *path* (query string
    and fragment are ignored). ``.pdf`` is appended when the name does not
    already end with it, and ``download.pdf`` is used when the URL path has
    no final segment at all.

    Args:
        url: Address of the file to fetch.
        temp_dir: Directory that receives the download; created if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        Path of the file written inside ``temp_dir``.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Errors raised by ``requests`` itself (connection failure,
            timeout) propagate unchanged.
    """
    from urllib.parse import urlparse

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(temp_dir, exist_ok=True)

    # Only the path component is used so "?token=..." or "#page=2" never
    # leaks into the file name; a trailing slash yields an empty name, hence
    # the fallback.
    file_name = os.path.basename(urlparse(url).path) or "download"
    if not file_name.lower().endswith(".pdf"):
        file_name += ".pdf"
    file_path = os.path.join(temp_dir, file_name)

    # The context manager releases the streamed connection on every exit
    # path, including the error branch below.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception("Không thể tải file từ URL")
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
    return file_path
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are read in document order with ``PyPDF2.PdfReader``. A page whose
    ``extract_text()`` result is falsy contributes an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined together with no separator.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pages = PyPDF2.PdfReader(pdf_stream).pages
        # Extraction happens inside the ``with`` so the file is still open
        # while each page is read.
        return "".join(page.extract_text() or "" for page in pages)
def extract_key_info(text: str) -> Dict:
    """Extract patient details and the diagnosis line from document text.

    Named entities come from the module-level spaCy pipeline ``nlp``. The
    diagnosis is whatever follows the first literal, case-sensitive
    ``DIAGNOSIS`` marker on the same line, with the heading separator
    (``:`` or ``-``) and surrounding whitespace removed.

    Args:
        text: Plain text of the document.

    Returns:
        A dict with two keys: ``"patient_info"`` (a dict that may contain
        ``"Patient"`` and ``"Date"``) and ``"diagnosis"`` (a list holding at
        most one string; empty when no marker is found or nothing follows it
        on its line).
    """
    patient_info = {}
    for ent in nlp(text).ents:
        # Each match overwrites the previous one, so the last PERSON / DATE
        # entity in the document is the one that is reported.
        if ent.label_ == "PERSON":
            patient_info["Patient"] = ent.text
        elif ent.label_ == "DATE":
            patient_info["Date"] = ent.text

    diagnosis = []
    marker = "DIAGNOSIS"
    marker_pos = text.find(marker)  # single scan instead of `in` + index()
    if marker_pos != -1:
        rest_of_line = text[marker_pos + len(marker):].partition("\n")[0]
        # Drop the separator after the heading ("DIAGNOSIS: Flu",
        # "DIAGNOSIS - Flu") so only the diagnosis itself is stored.
        diag_text = rest_of_line.strip().lstrip(":-").strip()
        # A bare heading with nothing after it yields no entry rather than
        # an empty string.
        if diag_text:
            diagnosis.append(diag_text)

    return {"patient_info": patient_info, "diagnosis": diagnosis}
def to_markdown(data: Dict) -> str:
    """Render extracted data as a Markdown document with two headed sections.

    Args:
        data: Mapping with a ``"patient_info"`` dict and a ``"diagnosis"``
            iterable, as produced by ``extract_key_info``.

    Returns:
        A ``# Patient Information`` heading followed by one ``- key: value``
        bullet per item, then a ``# Diagnosis`` heading followed by one
        bullet per diagnosis. Every line ends with a newline.
    """
    parts = ["# Patient Information\n"]
    parts.extend(
        f"- {field}: {content}\n"
        for field, content in data["patient_info"].items()
    )
    parts.append("# Diagnosis\n")
    parts.extend(f"- {entry}\n" for entry in data["diagnosis"])
    return "".join(parts)
def to_json(data: Dict) -> str:
    """Serialize ``data`` to a JSON string indented by two spaces.

    Args:
        data: JSON-serializable mapping.

    Returns:
        The output of ``json.dumps(data, indent=2)``; non-ASCII characters
        are escaped because ``ensure_ascii`` is left at its default.
    """
    from json import dumps

    serialized = dumps(data, indent=2)
    return serialized
def process_uploaded_file(
    file_path: str,
    upload_dir: str = "file-upload",
    output_dir: str = "data-extractor",
) -> Dict:
    """Copy a PDF into ``upload_dir``, extract its data, and write reports.

    The copied file is read with ``extract_text_from_pdf`` and the text is
    passed to ``extract_key_info``. The result is written to ``output_dir``
    as ``<name>.md`` and ``<name>.json``, where ``<name>`` is the input file
    name without its extension.

    Args:
        file_path: Path of the PDF to process.
        upload_dir: Directory that receives a copy of the input file;
            created if missing.
        output_dir: Directory that receives the Markdown and JSON reports;
            created if missing.

    Returns:
        The dict returned by ``extract_key_info``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(upload_dir, exist_ok=True)
    uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))
    try:
        shutil.copy(file_path, upload_dir)
    except shutil.SameFileError:
        # file_path already lives in upload_dir, so there is nothing to
        # copy; continue with the file in place instead of crashing.
        pass

    text = extract_text_from_pdf(uploaded_file_path)
    data = extract_key_info(text)

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    md_path = os.path.join(output_dir, f"{base_name}.md")
    json_path = os.path.join(output_dir, f"{base_name}.json")

    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(to_markdown(data))
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(to_json(data))
    return data