hoangkha1810's picture
Upload 5 files
e15d7ea verified
import os
import shutil
import PyPDF2
import spacy
import requests
from typing import Dict, List
# Load the spaCy English pipeline once at module import; extract_key_info
# reuses this shared instance for named-entity recognition.
nlp = spacy.load("en_core_web_sm")
def download_pdf_from_url(url: str, temp_dir: str = "temp", timeout: float = 30.0) -> str:
    """Download the file at ``url`` into ``temp_dir`` and return its local path.

    The file name is the last segment of the URL *path*; the query string and
    fragment are ignored so they never end up in the name. ``.pdf`` is
    appended when the name does not already end with it (case-insensitive).

    Args:
        url: Address of the file to download.
        temp_dir: Directory to save the file in; created when missing.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        Path of the saved file inside ``temp_dir``.

    Raises:
        Exception: If the server responds with a status code other than 200.
            Errors raised by ``requests`` itself (connection failure,
            timeout) propagate unchanged.
    """
    from urllib.parse import urlsplit

    os.makedirs(temp_dir, exist_ok=True)

    # Only the path component names the file; "?token=..." or "#page=2" must
    # not leak into it. basename() additionally drops OS-specific separators.
    last_segment = urlsplit(url).path.rsplit("/", 1)[-1]
    file_name = os.path.basename(last_segment) or "download"
    if not file_name.lower().endswith(".pdf"):
        file_name += ".pdf"
    file_path = os.path.join(temp_dir, file_name)

    # The context manager releases the streamed connection on every exit
    # path, including the error branch below.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception("Không thể tải file từ URL")
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return file_path
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page in the PDF at ``pdf_path``, concatenated.

    Pages for which the reader yields no text contribute an empty string.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # join() consumes the generator while the file is still open.
        return "".join(
            (pdf_page.extract_text() or "") for pdf_page in pdf_reader.pages
        )
def extract_key_info(text: str) -> Dict:
    """Extract patient details and the diagnosis line from report text.

    Named entities are taken from the module-level spaCy pipeline ``nlp``:
    the text of a ``PERSON`` entity is stored under ``"Patient"`` and that of
    a ``DATE`` entity under ``"Date"``. When several entities share a label,
    the last one in the document wins.

    The diagnosis is whatever follows the first literal ``DIAGNOSIS`` marker
    on the same line, with leading separators (``:``, ``-``) and surrounding
    whitespace removed. Nothing is recorded when that remainder is empty.

    Args:
        text: Plain text of the report.

    Returns:
        ``{"patient_info": {...}, "diagnosis": [...]}`` where ``diagnosis``
        holds at most one string.
    """
    doc = nlp(text)

    label_to_key = {"PERSON": "Patient", "DATE": "Date"}
    patient_info = {}
    for ent in doc.ents:
        key = label_to_key.get(ent.label_)
        if key is not None:
            patient_info[key] = ent.text

    diagnosis = []
    _, marker, remainder = text.partition("DIAGNOSIS")
    if marker:
        first_line = remainder.split("\n", 1)[0]
        # "DIAGNOSIS: Flu" should yield "Flu", not ": Flu".
        diag_text = first_line.strip().lstrip(":- \t").rstrip()
        # Skip the entry entirely when nothing follows the marker.
        if diag_text:
            diagnosis.append(diag_text)

    return {"patient_info": patient_info, "diagnosis": diagnosis}
def to_markdown(data: Dict) -> str:
    """Render extracted data as Markdown with two headed bullet lists.

    ``data["patient_info"]`` is emitted as ``- key: value`` bullets under
    ``# Patient Information``; each item of ``data["diagnosis"]`` becomes a
    bullet under ``# Diagnosis``. Every line ends with a newline.
    """
    pieces = ["# Patient Information\n"]
    pieces.extend(
        f"- {field}: {content}\n"
        for field, content in data["patient_info"].items()
    )
    pieces.append("# Diagnosis\n")
    pieces.extend(f"- {entry}\n" for entry in data["diagnosis"])
    return "".join(pieces)
def to_json(data: Dict) -> str:
    """Serialize ``data`` to a JSON string indented by two spaces.

    Non-ASCII characters (for example accented names) are written as-is
    rather than as ``\\uXXXX`` escapes, so the output stays readable when
    saved as UTF-8. The result parses back to the same data either way.
    """
    import json

    return json.dumps(data, indent=2, ensure_ascii=False)
def process_uploaded_file(file_path: str, upload_dir: str = "file-upload", output_dir: str = "data-extractor") -> Dict:
    """Copy a PDF into ``upload_dir``, extract its data and write reports.

    The copied PDF is parsed with ``extract_text_from_pdf`` and
    ``extract_key_info``; the result is written to ``output_dir`` as
    ``<base name>.md`` and ``<base name>.json`` (both UTF-8).

    Args:
        file_path: Path of the PDF to process.
        upload_dir: Directory the PDF is copied into; created when missing.
        output_dir: Directory the reports are written to; created when
            missing.

    Returns:
        The dictionary produced by ``extract_key_info``.
    """
    os.makedirs(upload_dir, exist_ok=True)
    uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))
    try:
        shutil.copy(file_path, uploaded_file_path)
    except shutil.SameFileError:
        # The input already lives in upload_dir; there is nothing to copy.
        pass

    text = extract_text_from_pdf(uploaded_file_path)
    data = extract_key_info(text)

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    with open(os.path.join(output_dir, f"{base_name}.md"), "w", encoding="utf-8") as md_file:
        md_file.write(to_markdown(data))
    with open(os.path.join(output_dir, f"{base_name}.json"), "w", encoding="utf-8") as json_file:
        json_file.write(to_json(data))
    return data