# app.py — PDF parser Gradio app: parse PDFs with Docling, chunk with semchunk,
# export a labeling-ready CSV. (Originally published as a Hugging Face Space.)
import os
import gradio as gr
import pandas as pd
import semchunk
from pathlib import Path
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer
CHUNK_SIZE = 512
EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
OUTPUT_CSV = "/tmp/dataset.csv"
print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
print("Tokenizer ready.")
def token_counter(text: str) -> int:
    """Return the number of tokens in *text* per the embedding tokenizer.

    Special tokens are excluded so the count reflects content only.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)
def process_pdfs(pdf_files, chunk_size):
if not pdf_files:
return None, None, "⚠️ Please upload at least one PDF."
chunk_size = int(chunk_size)
converter = DocumentConverter()
chunker = semchunk.chunkerify(token_counter, chunk_size=chunk_size)
log_lines = []
raw_texts = []
log_lines.append(f"Found {len(pdf_files)} PDF(s). Parsing…")
for pdf_path in pdf_files:
name = Path(pdf_path).name
log_lines.append(f" Parsing: {name}")
result = converter.convert(pdf_path)
text = result.document.export_to_markdown()
raw_texts.append(text)
log_lines.append(f"\nParsed {len(raw_texts)} document(s). Chunking…")
all_chunks = []
for i, text in enumerate(raw_texts):
chunks = chunker(text)
all_chunks.extend(chunks)
log_lines.append(f" Document {i+1}: {len(chunks)} chunks")
log_lines.append(f"\nTotal chunks: {len(all_chunks)}")
df = pd.DataFrame({
"text_input": all_chunks,
"icd-10": None,
"sbs": None,
"sfda": None,
"denial-code": None,
})
df.to_csv(OUTPUT_CSV, index=False)
log_lines.append(f"Saved → {OUTPUT_CSV}")
return OUTPUT_CSV, df.head(20), "\n".join(log_lines)
with gr.Blocks(title="PDF Parser") as demo:
gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")
with gr.Row():
pdf_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
chunk_size = gr.Slider(128, 1024, value=512, step=64, label="Chunk size (tokens)")
run_btn = gr.Button("Parse & Chunk", variant="primary")
with gr.Row():
log_out = gr.Textbox(label="Log", lines=12, interactive=False)
csv_out = gr.File(label="Download dataset.csv")
table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)
run_btn.click(
process_pdfs,
inputs=[pdf_input, chunk_size],
outputs=[csv_out, table_out, log_out],
)
demo.launch(server_name="0.0.0.0", server_port=7860)