File size: 2,608 Bytes
0be189e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import gradio as gr
import pandas as pd
import semchunk
from pathlib import Path
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

# Default max tokens per chunk (the UI slider overrides this per request).
CHUNK_SIZE  = 512
# Embedding model whose tokenizer defines how chunk sizes are counted.
EMBED_MODEL = "abhinand/MedEmbed-base-v0.1"
# Destination path for the generated dataset offered for download.
OUTPUT_CSV  = "/tmp/dataset.csv"

print("Loading tokenizer…")
# Loaded once at import time so every request reuses the same tokenizer
# (first run may download model files from the Hugging Face hub).
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
print("Tokenizer ready.")


def token_counter(text: str) -> int:
    """Return how many tokens *text* encodes to, excluding special tokens."""
    ids = tokenizer.encode(text, add_special_tokens=False)
    return len(ids)


def process_pdfs(pdf_files, chunk_size):
    """Parse uploaded PDFs with Docling, chunk the text, and save a CSV.

    Parameters
    ----------
    pdf_files : list[str] | None
        Filesystem paths of the uploaded PDFs (from the ``gr.File`` input).
    chunk_size : int | float
        Maximum chunk size in tokens; Gradio sliders may deliver a float.

    Returns
    -------
    tuple
        ``(csv_path, preview_df, log_text)`` on success, or
        ``(None, None, message)`` when there is nothing to process.
    """
    if not pdf_files:
        return None, None, "⚠️ Please upload at least one PDF."

    chunk_size = int(chunk_size)  # slider values can arrive as floats
    converter  = DocumentConverter()
    chunker    = semchunk.chunkerify(token_counter, chunk_size=chunk_size)

    log_lines = [f"Found {len(pdf_files)} PDF(s). Parsing…"]
    raw_texts = []

    for pdf_path in pdf_files:
        name = Path(pdf_path).name
        log_lines.append(f"  Parsing: {name}")
        # One corrupt/unreadable PDF must not abort the whole batch:
        # log the failure and keep going with the remaining files.
        try:
            result = converter.convert(pdf_path)
        except Exception as exc:
            log_lines.append(f"  ⚠️ Failed to parse {name}: {exc}")
            continue
        raw_texts.append(result.document.export_to_markdown())

    # If every file failed, report it instead of writing an empty dataset.
    if not raw_texts:
        log_lines.append("\n⚠️ No document could be parsed.")
        return None, None, "\n".join(log_lines)

    log_lines.append(f"\nParsed {len(raw_texts)} document(s). Chunking…")
    all_chunks = []
    for i, text in enumerate(raw_texts):
        chunks = chunker(text)
        all_chunks.extend(chunks)
        log_lines.append(f"  Document {i+1}: {len(chunks)} chunks")

    log_lines.append(f"\nTotal chunks: {len(all_chunks)}")

    # Label columns start empty (None) — they are filled in downstream
    # by the annotation workflow that consumes this CSV.
    df = pd.DataFrame({
        "text_input":  all_chunks,
        "icd-10":      None,
        "sbs":         None,
        "sfda":        None,
        "denial-code": None,
    })

    df.to_csv(OUTPUT_CSV, index=False)
    log_lines.append(f"Saved → {OUTPUT_CSV}")

    return OUTPUT_CSV, df.head(20), "\n".join(log_lines)


# UI layout: declaration order determines the rendered page, and the
# component variables below are wired into the click handler — keep
# ordering intact when editing.
with gr.Blocks(title="PDF Parser") as demo:
    gr.Markdown("## PDF Parser\nUpload PDFs → parse with Docling → chunk → download CSV")

    with gr.Row():
        # Multiple-file upload restricted to PDFs; yields a list of paths.
        pdf_input  = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
        # Token budget per chunk, passed through to semchunk.
        chunk_size = gr.Slider(128, 1024, value=512, step=64, label="Chunk size (tokens)")

    run_btn = gr.Button("Parse & Chunk", variant="primary")

    with gr.Row():
        # Read-only progress/error log produced by process_pdfs.
        log_out = gr.Textbox(label="Log", lines=12, interactive=False)
        # Download link for the generated dataset CSV.
        csv_out = gr.File(label="Download dataset.csv")

    table_out = gr.Dataframe(label="Preview (first 20 rows)", wrap=True)

    # Outputs map positionally to process_pdfs' (csv_path, preview, log) tuple.
    run_btn.click(
        process_pdfs,
        inputs=[pdf_input, chunk_size],
        outputs=[csv_out, table_out, log_out],
    )

# Bind to all interfaces on 7860 — typical for container/Spaces deployment.
demo.launch(server_name="0.0.0.0", server_port=7860)