Spaces:

perfectPresentation
/

testing

Paused

App Files Community

mostfa98 committed about 19 hours ago

Commit

0be189e

1 Parent(s): e68026c

add PDF parser app

Browse files

Files changed (3) hide show

Dockerfile +27 -0
app.py +85 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+WORKDIR /app
+# System dependencies needed by Docling / PDF processing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.11 \
+    python3-pip \
+    python3.11-dev \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    poppler-utils \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -sf /usr/bin/python3.11 /usr/bin/python \
+    && ln -sf /usr/bin/pip3 /usr/bin/pip
+# Install Python dependencies first (layer cache).
+# Run pip through the same interpreter that executes app.py: the pip3 script
+# shipped by python3-pip runs under the distro default python3 (3.10 on
+# Ubuntu 22.04), so a bare `pip install` would put the packages where
+# python3.11 does not look for them.
+COPY requirements.txt .
+RUN python -m pip install --no-cache-dir -r requirements.txt
+# Copy app
+COPY app.py .
+# Port the Gradio server in app.py listens on
+EXPOSE 7860
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os
+import gradio as gr
+import pandas as pd
+import semchunk
+from pathlib import Path
+from docling.document_converter import DocumentConverter
+from transformers import AutoTokenizer
+# Chunk size in tokens — presumably the default token budget per chunk; verify against the UI.
+CHUNK_SIZE  = 512
+# Hugging Face model id; only its tokenizer is loaded here, for token counting.
+EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
+# Path where process_pdfs writes the generated CSV.
+OUTPUT_CSV  = "/tmp/dataset.csv"
+# The tokenizer is loaded once at import time and shared by every request.
+print("Loading tokenizer…")
+tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
+print("Tokenizer ready.")
+def token_counter(text: str) -> int:
+    return len(tokenizer.encode(text, add_special_tokens=False))
+def process_pdfs(pdf_files, chunk_size):
+    if not pdf_files:
+        return None, None, "⚠️ Please upload at least one PDF."
+    chunk_size = int(chunk_size)
+    converter  = DocumentConverter()
+    chunker    = semchunk.chunkerify(token_counter, chunk_size=chunk_size)
+    log_lines  = []
+    raw_texts  = []
+    log_lines.append(f"Found {len(pdf_files)} PDF(s). Parsing…")
+    for pdf_path in pdf_files:
+        name = Path(pdf_path).name
+        log_lines.append(f"  Parsing: {name}")
+        result = converter.convert(pdf_path)
+        text   = result.document.export_to_markdown()
+        raw_texts.append(text)
+    log_lines.append(f"\nParsed {len(raw_texts)} document(s). Chunking…")
+    all_chunks = []
+    for i, text in enumerate(raw_texts):
+        chunks = chunker(text)
+        all_chunks.extend(chunks)
+        log_lines.append(f"  Document {i+1}: {len(chunks)} chunks")
+    log_lines.append(f"\nTotal chunks: {len(all_chunks)}")
+    df = pd.DataFrame({
+        "text_input":  all_chunks,
+        "icd-10":      None,
+        "sbs":         None,
+        "sfda":        None,
+        "denial-code": None,
+    })
+    df.to_csv(OUTPUT_CSV, index=False)
+    log_lines.append(f"Saved → {OUTPUT_CSV}")
+    return OUTPUT_CSV, df.head(20), "\n".join(log_lines)
+with gr.Blocks(title="PDF Parser") as demo:
+    gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")
+    with gr.Row():
+        pdf_input  = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
+        chunk_size = gr.Slider(128, 1024, value=512, step=64, label="Chunk size (tokens)")
+    run_btn = gr.Button("Parse & Chunk", variant="primary")
+    with gr.Row():
+        log_out = gr.Textbox(label="Log", lines=12, interactive=False)
+        csv_out = gr.File(label="Download dataset.csv")
+    table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)
+    run_btn.click(
+        process_pdfs,
+        inputs=[pdf_input, chunk_size],
+        outputs=[csv_out, table_out, log_out],
+    )
+demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Index options must sit on their own line; pip does not accept them after a
+# package name. --extra-index-url keeps PyPI available for the other packages.
+# NOTE(review): pin torch to a +cu121 build if that exact CUDA variant is required.
+--extra-index-url https://download.pytorch.org/whl/cu121
+gradio>=4.0.0
+docling
+semchunk
+transformers
+pandas
+torch