Spaces: Build error
Upload 5 files

- app.py +57 -0
- dockerfile +14 -0
- model/main.py +35 -0
- requirements.txt +87 -0
- upload/data.py +84 -0
app.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import uvicorn
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from model.main import process_and_analyze
+from upload.data import download_pdf_from_url, process_uploaded_file
+import shutil
+
+app = FastAPI(title="Agentic Document Extraction", description="API for extracting and analyzing PDF documents")
+
+class URLInput(BaseModel):
+    url: str
+
+@app.post("/upload")
+async def upload_file(file: UploadFile = File(...)):
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed!")
+
+    # Save uploaded file temporarily
+    file_path = f"/tmp/{file.filename}"
+    try:
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        # Process the uploaded file
+        process_uploaded_file(file_path)
+
+        if os.path.exists(file_path):
+            process_and_analyze(file_path)
+            return JSONResponse(content={"message": "Processing successful! Check files in data-extractor and file-upload."})
+        else:
+            raise HTTPException(status_code=500, detail="Error processing file!")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+    finally:
+        # Clean up temporary file
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+@app.post("/process-url")
+async def process_url(input: URLInput):
+    if not input.url:
+        raise HTTPException(status_code=400, detail="URL is required!")
+
+    try:
+        file_path = download_pdf_from_url(input.url)
+        if file_path and os.path.exists(file_path):
+            process_and_analyze(file_path)
+            return JSONResponse(content={"message": "Processing successful! Check files in data-extractor and file-upload."})
+        else:
+            raise HTTPException(status_code=500, detail="Error processing file from URL!")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
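For quick verification of the two endpoints above, a minimal client sketch in Python, assuming the server is reachable at http://localhost:8000; the file name sample.pdf and the example URL are placeholders, not part of the commit:

import requests

BASE = "http://localhost:8000"  # assumed local deployment

# /upload expects a single multipart form field named "file"
with open("sample.pdf", "rb") as f:
    resp = requests.post(f"{BASE}/upload",
                         files={"file": ("sample.pdf", f, "application/pdf")})
print(resp.status_code, resp.json())

# /process-url expects a JSON body matching the URLInput model
resp = requests.post(f"{BASE}/process-url",
                     json={"url": "https://example.com/sample.pdf"})
print(resp.status_code, resp.json())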
dockerfile
ADDED
@@ -0,0 +1,14 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+RUN python -m spacy download en_core_web_sm
+
+COPY . .
+
+EXPOSE 8000
+
+# The FastAPI instance is defined in app.py, so the uvicorn import string is app:app (not main:app)
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
model/main.py
ADDED
@@ -0,0 +1,35 @@
+import os
+from transformers import pipeline
+from upload.data import process_uploaded_file, extract_text_from_pdf, extract_key_info
+
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# The "conversational" pipeline and the Conversation class were removed in
+# transformers v4.42, so with transformers==4.52.4 a question-answering
+# pipeline stands in for "chatting" with the document text.
+qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+
+def summarize_text(text: str) -> str:
+    # truncation=True keeps long PDF text within the model's input limit
+    summary = summarizer(text, max_length=50, min_length=10, do_sample=False, truncation=True)
+    return summary[0]['summary_text']
+
+def chat_with_document(text: str, user_input: str) -> str:
+    result = qa_pipeline(question=user_input, context=text)
+    return result['answer']
+
+def process_and_analyze(file_path: str, interactive: bool = False):
+    data = process_uploaded_file(file_path)
+    text = extract_text_from_pdf(file_path)
+
+    summary = summarize_text(text)
+    print(f"Summary: {summary}")
+
+    # The blocking input() loop is opt-in: entering it from a FastAPI
+    # request handler would hang the server.
+    if interactive:
+        while True:
+            user_input = input("Enter a question (or 'exit' to quit): ")
+            if user_input.lower() == 'exit':
+                break
+            print(f"Answer: {chat_with_document(text, user_input)}")
+    return data
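As a quick smoke test of the two pipelines, a hedged sketch; the clinical text below is invented, and the first run downloads the model weights from the Hugging Face Hub:

from model.main import summarize_text, chat_with_document

text = (
    "Patient John Doe was admitted on March 3, 2024. "
    "DIAGNOSIS Type 2 diabetes mellitus, currently managed with metformin."
)
print(summarize_text(text))                                # short abstractive summary
print(chat_with_document(text, "What is the diagnosis?"))  # extractive answer span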
requirements.txt
ADDED
@@ -0,0 +1,87 @@
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.9.0
+blis==1.3.0
+catalogue==2.0.10
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.1
+cloudpathlib==0.21.1
+confection==0.1.5
+cymem==2.0.11
+exceptiongroup==1.3.0
+fastapi==0.115.12
+ffmpy==0.6.0
+filelock==3.18.0
+fsspec==2025.5.1
+gradio==5.32.1
+gradio_client==1.10.2
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.2
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.32.3
+idna==3.10
+Jinja2==3.1.6
+langcodes==3.5.0
+language_data==1.3.0
+marisa-trie==1.2.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+murmurhash==1.0.13
+networkx==3.4.2
+numpy==1.26.4
+orjson==3.10.18
+packaging==25.0
+pandas==2.2.3
+pillow==11.2.1
+preshed==3.0.10
+pydantic==2.11.5
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.1
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==14.0.0
+ruff==0.11.12
+safehttpx==0.1.6
+safetensors==0.5.3
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+smart-open==7.1.0
+sniffio==1.3.1
+spacy==3.8.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.5.1
+starlette==0.46.2
+sympy==1.14.0
+thinc==8.3.6
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.2.2
+tqdm==4.67.1
+transformers==4.52.4
+typer==0.16.0
+typing-inspection==0.4.1
+typing_extensions==4.14.0
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.3
+wasabi==1.1.3
+weasel==0.4.1
+websockets==15.0.1
+wrapt==1.17.2
+# fastapi==0.115.0  (duplicate pin disabled: conflicts with fastapi==0.115.12 above)
+# uvicorn==0.30.6   (duplicate pin disabled: conflicts with uvicorn==0.34.3 above)
+# gradio==4.44.0    (duplicate pin disabled: conflicts with gradio==5.32.1 above)
+# pydantic==2.9.2   (duplicate pin disabled: conflicts with pydantic==2.11.5 above)
upload/data.py
ADDED
@@ -0,0 +1,84 @@
+import os
+import shutil
+import PyPDF2
+import spacy
+import requests
+from typing import Dict, List
+
+# Load the spaCy model
+nlp = spacy.load("en_core_web_sm")
+
+def download_pdf_from_url(url: str, temp_dir: str = "temp") -> str:
+    if not os.path.exists(temp_dir):
+        os.makedirs(temp_dir)
+    file_name = url.split("/")[-1]
+    if not file_name.endswith(".pdf"):
+        file_name += ".pdf"
+    file_path = os.path.join(temp_dir, file_name)
+
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(file_path, 'wb') as f:
+            for chunk in response.iter_content(1024):
+                f.write(chunk)
+        return file_path
+    else:
+        raise Exception("Could not download the file from the URL")
+
+def extract_text_from_pdf(pdf_path: str) -> str:
+    with open(pdf_path, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text() or ""
+    return text
+
+def extract_key_info(text: str) -> Dict:
+    doc = nlp(text)
+    patient_info = {}
+    diagnosis = []
+
+    for ent in doc.ents:
+        if ent.label_ == "PERSON":
+            patient_info["Patient"] = ent.text
+        elif ent.label_ == "DATE":
+            patient_info["Date"] = ent.text
+
+    if "DIAGNOSIS" in text:
+        start_idx = text.index("DIAGNOSIS") + len("DIAGNOSIS")
+        diag_text = text[start_idx:].split("\n")[0].strip()
+        diagnosis.append(diag_text)
+
+    return {"patient_info": patient_info, "diagnosis": diagnosis}
+
+def to_markdown(data: Dict) -> str:
+    markdown = "# Patient Information\n"
+    for key, value in data["patient_info"].items():
+        markdown += f"- {key}: {value}\n"
+    markdown += "# Diagnosis\n"
+    for diag in data["diagnosis"]:
+        markdown += f"- {diag}\n"
+    return markdown
+
+def to_json(data: Dict) -> str:
+    import json
+    return json.dumps(data, indent=2)
+
+def process_uploaded_file(file_path: str, upload_dir: str = "file-upload", output_dir: str = "data-extractor"):
+    if not os.path.exists(upload_dir):
+        os.makedirs(upload_dir)
+    shutil.copy(file_path, upload_dir)
+    uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))
+
+    text = extract_text_from_pdf(uploaded_file_path)
+    data = extract_key_info(text)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    with open(os.path.join(output_dir, f"{base_name}.md"), "w", encoding="utf-8") as md_file:
+        md_file.write(to_markdown(data))
+    with open(os.path.join(output_dir, f"{base_name}.json"), "w", encoding="utf-8") as json_file:
+        json_file.write(to_json(data))
+
+    return data
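The extraction path can also be exercised without the API server, a minimal sketch assuming a local report.pdf exists (the file name is illustrative):

from upload.data import extract_text_from_pdf, extract_key_info, to_markdown, to_json

text = extract_text_from_pdf("report.pdf")  # concatenated text of every page
data = extract_key_info(text)               # {"patient_info": {...}, "diagnosis": [...]}
print(to_markdown(data))                    # Markdown rendering of the extracted fields
print(to_json(data))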