from pathlib import Path

import gradio as gr
import pandas as pd
import semchunk
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

# Default chunk size in tokens (mirrored by the UI slider below).
CHUNK_SIZE = 512
# Embedding model whose tokenizer defines how chunk sizes are counted.
EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
# Where the generated dataset is written inside the container.
OUTPUT_CSV = "/tmp/dataset.csv"
| print("Loading tokenizer…") | |
| tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL) | |
| print("Tokenizer ready.") | |


def token_counter(text: str) -> int:
    """Count tokens the way the embedding model's tokenizer does."""
    return len(tokenizer.encode(text, add_special_tokens=False))
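
# Illustrative use (exact counts depend on the MedEmbed tokenizer's vocabulary):
#   token_counter("Patient presents with acute chest pain.")
# add_special_tokens=False keeps special tokens out of the count, so chunk
# budgets reflect content tokens only.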


def process_pdfs(pdf_files, chunk_size):
    """Parse uploaded PDFs with Docling, chunk the text, and export a CSV."""
    if not pdf_files:
        return None, None, "⚠️ Please upload at least one PDF."

    chunk_size = int(chunk_size)
    converter = DocumentConverter()
    # chunkerify accepts a token-counting callable and returns a chunker that
    # splits text into chunks of at most chunk_size tokens.
    chunker = semchunk.chunkerify(token_counter, chunk_size=chunk_size)

    log_lines = []
    raw_texts = []
    log_lines.append(f"Found {len(pdf_files)} PDF(s). Parsing…")
    for pdf_path in pdf_files:
        name = Path(pdf_path).name
        log_lines.append(f"  Parsing: {name}")
        result = converter.convert(pdf_path)
        # Markdown export preserves headings and tables as plain-text structure.
        text = result.document.export_to_markdown()
        raw_texts.append(text)
| log_lines.append(f"\nParsed {len(raw_texts)} document(s). Chunking…") | |
| all_chunks = [] | |
| for i, text in enumerate(raw_texts): | |
| chunks = chunker(text) | |
| all_chunks.extend(chunks) | |
| log_lines.append(f" Document {i+1}: {len(chunks)} chunks") | |
| log_lines.append(f"\nTotal chunks: {len(all_chunks)}") | |
| df = pd.DataFrame({ | |
| "text_input": all_chunks, | |
| "icd-10": None, | |
| "sbs": None, | |
| "sfda": None, | |
| "denial-code": None, | |
| }) | |
| df.to_csv(OUTPUT_CSV, index=False) | |
| log_lines.append(f"Saved → {OUTPUT_CSV}") | |
| return OUTPUT_CSV, df.head(20), "\n".join(log_lines) | |
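
# Standalone usage sketch (assumes a local PDF path; the Space itself only
# invokes process_pdfs through the Gradio button below):
#   csv_path, preview_df, log = process_pdfs(["/path/to/file.pdf"], CHUNK_SIZE)
#   print(log)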


with gr.Blocks(title="PDF Parser") as demo:
    gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
        chunk_size = gr.Slider(128, 1024, value=CHUNK_SIZE, step=64, label="Chunk size (tokens)")
    run_btn = gr.Button("Parse & Chunk", variant="primary")
    with gr.Row():
        log_out = gr.Textbox(label="Log", lines=12, interactive=False)
        csv_out = gr.File(label="Download dataset.csv")
    table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)

    run_btn.click(
        process_pdfs,
        inputs=[pdf_input, chunk_size],
        outputs=[csv_out, table_out, log_out],
    )

demo.launch(server_name="0.0.0.0", server_port=7860)
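
# Assumed requirements for this Space (inferred from the imports above, not
# from a pinned requirements.txt): gradio, pandas, semchunk, docling, transformers.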