hoangkha1810 committed on
Commit e15d7ea · verified · 1 Parent(s): 51b342e

Upload 5 files

Files changed (5)
  1. app.py +61 -0
  2. dockerfile +13 -0
  3. model/main.py +37 -0
  4. requirements.txt +83 -0
  5. upload/data.py +84 -0
app.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ import shutil
+ import uvicorn
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel
+ from model.main import process_and_analyze
+ from upload.data import download_pdf_from_url
+
+ app = FastAPI(title="Agentic Document Extraction", description="API for extracting and analyzing PDF documents")
+
+ class URLInput(BaseModel):
+     url: str
+
+ @app.post("/upload")
+ async def upload_file(file: UploadFile = File(...)):
+     if not file.filename.endswith('.pdf'):
+         raise HTTPException(status_code=400, detail="Only PDF files are allowed!")
+
+     # Save the uploaded file temporarily
+     file_path = f"/tmp/{file.filename}"
+     try:
+         with open(file_path, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         # process_and_analyze already copies the file and extracts its
+         # contents, so process_uploaded_file is not also called here.
+         if os.path.exists(file_path):
+             process_and_analyze(file_path)
+             return JSONResponse(content={"message": "Processing successful! Check files in data-extractor and file-upload."})
+         else:
+             raise HTTPException(status_code=500, detail="Error processing file!")
+     except HTTPException:
+         # Let HTTP errors pass through instead of wrapping them in a generic 500
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+     finally:
+         # Clean up the temporary file
+         if os.path.exists(file_path):
+             os.remove(file_path)
+
+ @app.post("/process-url")
+ async def process_url(input: URLInput):
+     if not input.url:
+         raise HTTPException(status_code=400, detail="URL is required!")
+
+     try:
+         file_path = download_pdf_from_url(input.url)
+         if file_path and os.path.exists(file_path):
+             process_and_analyze(file_path)
+             return JSONResponse(content={"message": "Processing successful! Check files in data-extractor and file-upload."})
+         else:
+             raise HTTPException(status_code=500, detail="Error processing file from URL!")
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
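
For reference, a minimal client sketch for these two endpoints, assuming the server is running locally on port 8000; sample.pdf and the report URL are hypothetical placeholders (requests is already pinned in requirements.txt):

import requests

BASE = "http://localhost:8000"  # assumed local dev server

# Send a local PDF to /upload (sample.pdf is a hypothetical test file)
with open("sample.pdf", "rb") as f:
    r = requests.post(f"{BASE}/upload", files={"file": ("sample.pdf", f, "application/pdf")})
print(r.status_code, r.json())

# Ask the server to fetch and process a remote PDF (placeholder URL)
r = requests.post(f"{BASE}/process-url", json={"url": "https://example.com/report.pdf"})
print(r.status_code, r.json())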
dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN python -m spacy download en_core_web_sm
+
+ COPY . .
+
+ EXPOSE 8000
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
model/main.py ADDED
@@ -0,0 +1,37 @@
+ import sys
+
+ from transformers import pipeline
+ from upload.data import process_uploaded_file, extract_text_from_pdf
+
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ # The "conversational" pipeline and the Conversation class were removed in
+ # transformers 4.42+, so use a plain text-generation pipeline with DialoGPT.
+ chatbot = pipeline("text-generation", model="microsoft/DialoGPT-medium")
+
+ def summarize_text(text: str) -> str:
+     # truncation=True keeps long documents within BART's input limit
+     summary = summarizer(text, max_length=50, min_length=10, do_sample=False, truncation=True)
+     return summary[0]['summary_text']
+
+ def chat_with_document(text: str, user_input: str) -> str:
+     # Prepend a slice of the document so the model has context to draw on
+     prompt = f"{text[:1000]}\n\nQuestion: {user_input}\nAnswer:"
+     response = chatbot(prompt, max_new_tokens=50, return_full_text=False)
+     return response[0]['generated_text'].strip()
+
+ def process_and_analyze(file_path: str):
+     data = process_uploaded_file(file_path)
+     text = extract_text_from_pdf(file_path)
+
+     summary = summarize_text(text)
+     print(f"Summary: {summary}")
+
+     # input() blocks the whole process, so only start the interactive Q&A
+     # loop when attached to a terminal, not when called from the API.
+     if sys.stdin.isatty():
+         while True:
+             user_input = input("Enter a question (or 'exit' to quit): ")
+             if user_input.lower() == 'exit':
+                 break
+             response = chat_with_document(text, user_input)
+             print(f"Answer: {response}")
requirements.txt ADDED
@@ -0,0 +1,83 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ blis==1.3.0
+ catalogue==2.0.10
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ click==8.2.1
+ cloudpathlib==0.21.1
+ confection==0.1.5
+ cymem==2.0.11
+ exceptiongroup==1.3.0
+ fastapi==0.115.12
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fsspec==2025.5.1
+ gradio==5.32.1
+ gradio_client==1.10.2
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.2
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.32.3
+ idna==3.10
+ Jinja2==3.1.6
+ langcodes==3.5.0
+ language_data==1.3.0
+ marisa-trie==1.2.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ murmurhash==1.0.13
+ networkx==3.4.2
+ numpy==1.26.4
+ orjson==3.10.18
+ packaging==25.0
+ pandas==2.2.3
+ pillow==11.2.1
+ preshed==3.0.10
+ pydantic==2.11.5
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.12
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ smart-open==7.1.0
+ sniffio==1.3.1
+ spacy==3.8.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.5.1
+ starlette==0.46.2
+ sympy==1.14.0
+ thinc==8.3.6
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.2.2
+ tqdm==4.67.1
+ transformers==4.52.4
+ typer==0.16.0
+ typing-inspection==0.4.1
+ typing_extensions==4.14.0
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.3
+ wasabi==1.1.3
+ weasel==0.4.1
+ websockets==15.0.1
+ wrapt==1.17.2
upload/data.py ADDED
@@ -0,0 +1,84 @@
+ import json
+ import os
+ import shutil
+ import PyPDF2
+ import spacy
+ import requests
+ from typing import Dict
+
+ # Load the spaCy model
+ nlp = spacy.load("en_core_web_sm")
+
+ def download_pdf_from_url(url: str, temp_dir: str = "temp") -> str:
+     if not os.path.exists(temp_dir):
+         os.makedirs(temp_dir)
+     file_name = url.split("/")[-1]
+     if not file_name.endswith(".pdf"):
+         file_name += ".pdf"
+     file_path = os.path.join(temp_dir, file_name)
+
+     response = requests.get(url, stream=True)
+     if response.status_code == 200:
+         with open(file_path, 'wb') as f:
+             for chunk in response.iter_content(1024):
+                 f.write(chunk)
+         return file_path
+     else:
+         raise Exception("Could not download the file from the URL")
+
+ def extract_text_from_pdf(pdf_path: str) -> str:
+     with open(pdf_path, 'rb') as file:
+         reader = PyPDF2.PdfReader(file)
+         text = ""
+         for page in reader.pages:
+             text += page.extract_text() or ""
+     return text
+
+ def extract_key_info(text: str) -> Dict:
+     doc = nlp(text)
+     patient_info = {}
+     diagnosis = []
+
+     for ent in doc.ents:
+         if ent.label_ == "PERSON":
+             patient_info["Patient"] = ent.text
+         elif ent.label_ == "DATE":
+             patient_info["Date"] = ent.text
+
+     if "DIAGNOSIS" in text:
+         start_idx = text.index("DIAGNOSIS") + len("DIAGNOSIS")
+         diag_text = text[start_idx:].split("\n")[0].strip()
+         diagnosis.append(diag_text)
+
+     return {"patient_info": patient_info, "diagnosis": diagnosis}
+
+ def to_markdown(data: Dict) -> str:
+     markdown = "# Patient Information\n"
+     for key, value in data["patient_info"].items():
+         markdown += f"- {key}: {value}\n"
+     markdown += "# Diagnosis\n"
+     for diag in data["diagnosis"]:
+         markdown += f"- {diag}\n"
+     return markdown
+
+ def to_json(data: Dict) -> str:
+     return json.dumps(data, indent=2)
+
+ def process_uploaded_file(file_path: str, upload_dir: str = "file-upload", output_dir: str = "data-extractor"):
+     if not os.path.exists(upload_dir):
+         os.makedirs(upload_dir)
+     shutil.copy(file_path, upload_dir)
+     uploaded_file_path = os.path.join(upload_dir, os.path.basename(file_path))
+
+     text = extract_text_from_pdf(uploaded_file_path)
+     data = extract_key_info(text)
+
+     if not os.path.exists(output_dir):
+         os.makedirs(output_dir)
+     base_name = os.path.splitext(os.path.basename(file_path))[0]
+     with open(os.path.join(output_dir, f"{base_name}.md"), "w", encoding="utf-8") as md_file:
+         md_file.write(to_markdown(data))
+     with open(os.path.join(output_dir, f"{base_name}.json"), "w", encoding="utf-8") as json_file:
+         json_file.write(to_json(data))
+
+     return data
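
To see the shape of the extracted output without a real PDF, the entity and section logic can be exercised on a plain string; the record below is invented, and importing the module requires en_core_web_sm to be installed:

from upload.data import extract_key_info, to_markdown, to_json

text = "Patient John Smith was examined on 12 March 2024.\nDIAGNOSIS Acute bronchitis\nFollow-up in two weeks."
data = extract_key_info(text)
print(to_markdown(data))  # markdown with Patient/Date and diagnosis bullets
print(to_json(data))      # the same data as indented JSON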