"""Gradio app: parse uploaded PDFs with Docling, chunk the text with semchunk,
and export the chunks as a CSV ready for downstream labeling."""

import os  # NOTE(review): unused in this file — kept in case external tooling relies on it
from pathlib import Path

import gradio as gr
import pandas as pd
import semchunk
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

# Default chunk size in tokens; the UI slider overrides it per run.
CHUNK_SIZE = 512
# HF model whose tokenizer defines the token counts used for chunking.
EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
# Destination of the generated dataset offered for download.
OUTPUT_CSV = "/tmp/dataset.csv"

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
print("Tokenizer ready.")


def token_counter(text: str) -> int:
    """Return the number of model tokens in *text*, excluding special tokens."""
    return len(tokenizer.encode(text, add_special_tokens=False))


def process_pdfs(pdf_files, chunk_size):
    """Parse each uploaded PDF, chunk its markdown export, and save a CSV.

    Args:
        pdf_files: list of file paths supplied by the Gradio ``File`` component
            (``file_count="multiple"``); may be ``None``/empty when nothing
            was uploaded.
        chunk_size: target chunk size in tokens. The slider may deliver a
            float, so it is coerced to ``int``.

    Returns:
        ``(csv_path, preview_df, log_text)``. On missing input the first two
        elements are ``None`` and the log carries a warning instead.
    """
    if not pdf_files:
        return None, None, "⚠️ Please upload at least one PDF."

    chunk_size = int(chunk_size)  # Gradio sliders can emit floats
    converter = DocumentConverter()
    chunker = semchunk.chunkerify(token_counter, chunk_size=chunk_size)

    log_lines = []
    raw_texts = []
    log_lines.append(f"Found {len(pdf_files)} PDF(s). Parsing…")
    for pdf_path in pdf_files:
        name = Path(pdf_path).name
        log_lines.append(f" Parsing: {name}")
        result = converter.convert(pdf_path)
        # Markdown export keeps headings/structure, which helps semantic chunking.
        raw_texts.append(result.document.export_to_markdown())

    # NOTE: original source had this literal broken across a physical newline
    # (a syntax error); reconstructed as a single f-string.
    log_lines.append(f"\nParsed {len(raw_texts)} document(s). Chunking…")
    all_chunks = []
    for i, text in enumerate(raw_texts):
        chunks = chunker(text)
        all_chunks.extend(chunks)
        log_lines.append(f" Document {i+1}: {len(chunks)} chunks")
    log_lines.append(f"\nTotal chunks: {len(all_chunks)}")

    # Label columns start empty; annotators fill them in downstream.
    df = pd.DataFrame({
        "text_input": all_chunks,
        "icd-10": None,
        "sbs": None,
        "sfda": None,
        "denial-code": None,
    })
    df.to_csv(OUTPUT_CSV, index=False)
    log_lines.append(f"Saved → {OUTPUT_CSV}")
    return OUTPUT_CSV, df.head(20), "\n".join(log_lines)


with gr.Blocks(title="PDF Parser") as demo:
    gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
        chunk_size = gr.Slider(128, 1024, value=512, step=64, label="Chunk size (tokens)")
    run_btn = gr.Button("Parse & Chunk", variant="primary")
    with gr.Row():
        log_out = gr.Textbox(label="Log", lines=12, interactive=False)
        csv_out = gr.File(label="Download dataset.csv")
    table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)

    run_btn.click(
        process_pdfs,
        inputs=[pdf_input, chunk_size],
        outputs=[csv_out, table_out, log_out],
    )

if __name__ == "__main__":
    # Guarded so importing this module (e.g. from tests) builds the UI
    # without binding a server port.
    demo.launch(server_name="0.0.0.0", server_port=7860)