mostfa98 committed on
Commit
0be189e
·
1 Parent(s): e68026c

add PDF parser app

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. app.py +85 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

WORKDIR /app

# System dependencies needed by Docling / PDF processing.
# `python` is pointed at Python 3.11; the `pip` symlink is kept for
# convenience but targets the distro pip3 (see the install step below).
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.11 \
        python3-pip \
        python3.11-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        poppler-utils \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python \
    && ln -sf /usr/bin/pip3 /usr/bin/pip

# Install Python dependencies first (layer cache).
# Invoke pip through `python -m pip` so the packages are installed for the
# same interpreter (Python 3.11) that runs app.py. The /usr/bin/pip3 script
# shipped by python3-pip runs under the distro default python3, so a bare
# `pip install` would put the packages where `python app.py` cannot import
# them.
COPY requirements.txt .
RUN python -m pip install --no-cache-dir -r requirements.txt

# Copy app
COPY app.py .

# Port the Gradio server in app.py listens on.
EXPOSE 7860

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import semchunk
5
+ from pathlib import Path
6
+ from docling.document_converter import DocumentConverter
7
+ from transformers import AutoTokenizer
8
+
9
# Default chunk size in tokens.
# NOTE(review): not referenced by the visible code; the UI slider hard-codes
# its own default of 512 — confirm whether this constant is still needed.
CHUNK_SIZE = 512
# Hugging Face model id whose tokenizer is used to count tokens for chunking.
EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
# Fixed location the generated dataset is written to on every run.
OUTPUT_CSV = "/tmp/dataset.csv"

# The tokenizer is loaded once at import time and shared by token_counter().
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
print("Tokenizer ready.")
16
+
17
+
18
def token_counter(text: str) -> int:
    """Return how many tokens the module-level tokenizer produces for *text*.

    Special tokens are not added, so only the tokens of the text itself
    are counted.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)
20
+
21
+
22
def process_pdfs(pdf_files, chunk_size):
    """Parse uploaded PDFs with Docling, chunk the text, and write a CSV.

    Each PDF is converted to Markdown, split into chunks of at most
    ``chunk_size`` tokens (as counted by ``token_counter``), and the chunks
    are written to ``OUTPUT_CSV`` with empty label columns.

    A PDF that fails to convert is reported in the log and skipped, so one
    bad file does not abort the whole batch.

    Args:
        pdf_files: Paths of the uploaded PDFs; may be ``None`` or empty.
        chunk_size: Maximum number of tokens per chunk; coerced to ``int``.

    Returns:
        A ``(csv_path, preview, log)`` tuple: the path of the written CSV,
        a DataFrame with the first 20 rows, and the log text. ``csv_path``
        and ``preview`` are ``None`` when no file was given or no file
        could be parsed.
    """
    if not pdf_files:
        return None, None, "⚠️ Please upload at least one PDF."

    chunk_size = int(chunk_size)
    converter = DocumentConverter()
    chunker = semchunk.chunkerify(token_counter, chunk_size=chunk_size)

    log_lines = [f"Found {len(pdf_files)} PDF(s). Parsing…"]
    parsed = []  # (file name, markdown text) for every PDF that converted

    for pdf_path in pdf_files:
        name = Path(pdf_path).name
        log_lines.append(f" Parsing: {name}")
        try:
            result = converter.convert(pdf_path)
            text = result.document.export_to_markdown()
        except Exception as exc:  # boundary: report the failure, keep going
            log_lines.append(f" ❌ Failed: {name} ({exc})")
            continue
        parsed.append((name, text))

    if not parsed:
        # Nothing usable: surface the log instead of writing an empty CSV.
        log_lines.append("\n⚠️ No documents could be parsed.")
        return None, None, "\n".join(log_lines)

    log_lines.append(f"\nParsed {len(parsed)} document(s). Chunking…")
    all_chunks = []
    for i, (name, text) in enumerate(parsed, start=1):
        chunks = chunker(text)
        all_chunks.extend(chunks)
        # Include the file name: numbering alone is ambiguous once failed
        # files have been skipped.
        log_lines.append(f" Document {i} ({name}): {len(chunks)} chunks")

    log_lines.append(f"\nTotal chunks: {len(all_chunks)}")

    # Label columns are left empty.
    df = pd.DataFrame({
        "text_input": all_chunks,
        "icd-10": None,
        "sbs": None,
        "sfda": None,
        "denial-code": None,
    })

    df.to_csv(OUTPUT_CSV, index=False)
    log_lines.append(f"Saved → {OUTPUT_CSV}")

    return OUTPUT_CSV, df.head(20), "\n".join(log_lines)
62
+
63
+
64
# Gradio UI: upload PDFs and pick a chunk size, run process_pdfs, then show
# the log, a download link for the CSV, and a preview table.
with gr.Blocks(title="PDF Parser") as demo:
    gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")

    # Inputs: one or more PDFs and the maximum chunk size in tokens.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
        chunk_size = gr.Slider(128, 1024, value=512, step=64, label="Chunk size (tokens)")

    run_btn = gr.Button("Parse & Chunk", variant="primary")

    # Outputs: processing log next to the downloadable CSV.
    with gr.Row():
        log_out = gr.Textbox(label="Log", lines=12, interactive=False)
        csv_out = gr.File(label="Download dataset.csv")

    table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)

    # The outputs order must match process_pdfs' return tuple:
    # (csv path, preview DataFrame, log text).
    run_btn.click(
        process_pdfs,
        inputs=[pdf_input, chunk_size],
        outputs=[csv_out, table_out, log_out],
    )

# Listen on all interfaces on port 7860 (the port the Dockerfile exposes).
demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Index options must be on their own line: pip ignores global options such as
# --index-url when they are appended to a requirement line.
# --extra-index-url (rather than --index-url) keeps PyPI available for the
# other packages while also consulting the PyTorch CUDA 12.1 wheel index.
# NOTE(review): torch is unpinned, so pip picks the newest version across both
# indexes; pin a "+cu121" build if an exact CUDA 12.1 wheel is required.
--extra-index-url https://download.pytorch.org/whl/cu121
gradio>=4.0.0
docling
semchunk
transformers
pandas
torch