Spaces:

hoangkha1810
/

Agentic-Document-Extractor

Build error

App Files Files Community

Agentic-Document-Extractor / upload /data.py

hoangkha1810

Upload 5 files

e15d7ea verified 7 months ago

raw

history blame contribute delete

2.76 kB

	import os
	import shutil
	import PyPDF2
	import spacy
	import requests
	from typing import Dict, List

	# Load the spaCy "en_core_web_sm" pipeline once, at import time.
	# extract_key_info() calls this shared object to find named entities.
	nlp = spacy.load("en_core_web_sm")

	def download_pdf_from_url(url: str, temp_dir: str = "temp", timeout: float = 30.0) -> str:
	    """Download the file at ``url`` into ``temp_dir`` and return its local path.

	    The local file name is the last segment of the URL *path*, so a query
	    string or fragment never leaks into the name. ``.pdf`` is appended when
	    the name does not already end with it (case-insensitively).

	    Args:
	        url: Address of the file to fetch.
	        temp_dir: Directory that receives the download; created if missing.
	        timeout: Seconds to wait for the server before ``requests`` gives up.

	    Returns:
	        Path of the downloaded file inside ``temp_dir``.

	    Raises:
	        Exception: If the server answers with a status other than 200.
	            Errors raised by ``requests`` itself (connection failure,
	            timeout) propagate unchanged.
	    """
	    from urllib.parse import urlparse

	    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
	    os.makedirs(temp_dir, exist_ok=True)

	    # Use only the path component; fall back to a fixed name when the URL
	    # ends with "/" and therefore has no last segment.
	    file_name = os.path.basename(urlparse(url).path) or "download"
	    if not file_name.lower().endswith(".pdf"):
	        file_name += ".pdf"
	    file_path = os.path.join(temp_dir, file_name)

	    # The context manager releases the streamed connection on every path.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        if response.status_code != 200:
	            raise Exception("Không thể tải file từ URL")
	        with open(file_path, 'wb') as f:
	            for chunk in response.iter_content(8192):
	                if chunk:  # skip empty keep-alive chunks
	                    f.write(chunk)
	    return file_path

	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Return the text of all pages of the PDF at ``pdf_path``, concatenated.

	    Pages are read in order with ``PyPDF2.PdfReader``. A page whose
	    ``extract_text()`` result is falsy contributes an empty string.
	    """
	    with open(pdf_path, 'rb') as pdf_stream:
	        pdf_reader = PyPDF2.PdfReader(pdf_stream)
	        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
	    return "".join(page_texts)

	def extract_key_info(text: str) -> Dict:
	    """Extract patient details and the diagnosis line from document text.

	    The module-level spaCy pipeline ``nlp`` is run over ``text``. Entities
	    labelled ``PERSON`` are stored under ``"Patient"`` and entities labelled
	    ``DATE`` under ``"Date"``; a later entity overwrites an earlier one with
	    the same label. The diagnosis is the rest of the line that follows the
	    first literal ``DIAGNOSIS`` in ``text`` (the match is case-sensitive).

	    Args:
	        text: Plain text of the document.

	    Returns:
	        ``{"patient_info": {...}, "diagnosis": [...]}``. ``diagnosis`` is
	        empty when the marker is absent or nothing follows it on its line.
	    """
	    doc = nlp(text)

	    label_to_key = {"PERSON": "Patient", "DATE": "Date"}
	    patient_info = {}
	    for ent in doc.ents:
	        key = label_to_key.get(ent.label_)
	        if key is not None:
	            patient_info[key] = ent.text

	    diagnosis = []
	    # partition() finds the marker and splits in a single pass.
	    _, marker, after_marker = text.partition("DIAGNOSIS")
	    if marker:
	        first_line = after_marker.split("\n", 1)[0]
	        # Drop the separator of "DIAGNOSIS: Flu" / "DIAGNOSIS - Flu" so that
	        # only the diagnosis itself is stored.
	        diag_text = first_line.lstrip(" \t:-").strip()
	        # A bare heading line carries no diagnosis; do not record "".
	        if diag_text:
	            diagnosis.append(diag_text)

	    return {"patient_info": patient_info, "diagnosis": diagnosis}

	def to_markdown(data: Dict) -> str:
	    """Render extracted data as a two-section Markdown document.

	    Args:
	        data: Mapping with a ``"patient_info"`` dict and a ``"diagnosis"``
	            list, as returned by ``extract_key_info``.

	    Returns:
	        A ``# Patient Information`` section with one bullet per key/value
	        pair, followed by a ``# Diagnosis`` section with one bullet per entry.
	    """
	    parts = ["# Patient Information\n"]
	    parts.extend(
	        f"- {field}: {content}\n" for field, content in data["patient_info"].items()
	    )
	    parts.append("# Diagnosis\n")
	    parts.extend(f"- {entry}\n" for entry in data["diagnosis"])
	    return "".join(parts)

	def to_json(data: Dict) -> str:
	    """Serialize ``data`` to a JSON string indented by two spaces.

	    Non-ASCII characters (for example accented names) are written as-is
	    rather than as ASCII escape sequences, so the output stays readable
	    when it is saved to a UTF-8 file. The parsed result is unchanged.

	    Args:
	        data: JSON-serializable mapping.

	    Returns:
	        The JSON text.
	    """
	    import json

	    return json.dumps(data, indent=2, ensure_ascii=False)

	def process_uploaded_file(file_path: str, upload_dir: str = "file-upload", output_dir: str = "data-extractor") -> Dict:
	    """Store an uploaded PDF, extract its key information, and write reports.

	    The file is copied into ``upload_dir``, its text is extracted with
	    ``extract_text_from_pdf`` and analysed with ``extract_key_info``. The
	    result is written to ``output_dir`` as ``<name>.md`` and ``<name>.json``,
	    where ``<name>`` is the input file name without its extension.

	    Args:
	        file_path: Path of the PDF to process.
	        upload_dir: Directory that keeps a copy of the input; created if missing.
	        output_dir: Directory that receives the reports; created if missing.

	    Returns:
	        The dictionary produced by ``extract_key_info``.
	    """
	    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
	    os.makedirs(upload_dir, exist_ok=True)
	    uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))

	    # shutil.copy raises SameFileError when source and destination are the
	    # same file, which happens if the input already lives in upload_dir.
	    already_in_place = os.path.exists(uploaded_file_path) and os.path.samefile(
	        file_path, uploaded_file_path
	    )
	    if not already_in_place:
	        shutil.copy(file_path, uploaded_file_path)

	    text = extract_text_from_pdf(uploaded_file_path)
	    data = extract_key_info(text)

	    os.makedirs(output_dir, exist_ok=True)
	    base_name = os.path.splitext(os.path.basename(file_path))[0]
	    md_path = os.path.join(output_dir, f"{base_name}.md")
	    json_path = os.path.join(output_dir, f"{base_name}.json")
	    with open(md_path, "w", encoding="utf-8") as md_file:
	        md_file.write(to_markdown(data))
	    with open(json_path, "w", encoding="utf-8") as json_file:
	        json_file.write(to_json(data))

	    return data