BIBLETUM committed on
Commit 5a246b8 (verified)
1 Parent(s): f6c6193

Create app.py

Files changed (1):
  app.py (+304 −0)
app.py ADDED
@@ -0,0 +1,304 @@
import os
import re
import numpy as np
import gradio as gr
import soundfile as sf
from PIL import Image

import fitz  # PyMuPDF

import torch
from transformers import (
    pipeline,
    DonutProcessor,
    VisionEncoderDecoderModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
from sentence_transformers import SentenceTransformer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

WHISPER_MODEL = os.getenv("WHISPER_MODEL", "openai/whisper-tiny")
DONUT_MODEL = os.getenv("DONUT_MODEL", "naver-clova-ix/donut-base-finetuned-docvqa")
T5_MODEL = os.getenv("T5_MODEL", "google/flan-t5-small")
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

MAX_PAGES = int(os.getenv("MAX_PAGES", "2"))
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1600"))  # px
TOPK = int(os.getenv("TOPK", "5"))

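# Note: all four models below load eagerly at import time. The small default
# checkpoints keep startup cost modest; pointing the env vars above at larger
# checkpoints will raise load time and RAM/VRAM use accordingly.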
# ---------- Models ----------
asr = pipeline(
    task="automatic-speech-recognition",
    model=WHISPER_MODEL,
    device=0 if DEVICE == "cuda" else -1,
)

donut_processor = DonutProcessor.from_pretrained(DONUT_MODEL)
# Load Donut weights in DTYPE: donut_docvqa() casts its pixel values to DTYPE,
# and fp16 inputs against fp32 weights would fail on CUDA.
donut_model = VisionEncoderDecoderModel.from_pretrained(DONUT_MODEL, torch_dtype=DTYPE).to(DEVICE)
donut_model.eval()

t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_MODEL).to(DEVICE)
t5_model.eval()

embedder = SentenceTransformer(EMB_MODEL, device=DEVICE)


# ---------- Utils ----------
def _resize_max(im: Image.Image, max_side: int) -> Image.Image:
    w, h = im.size
    m = max(w, h)
    if m <= max_side:
        return im
    scale = max_side / float(m)
    nw, nh = max(1, int(w * scale)), max(1, int(h * scale))
    return im.resize((nw, nh), Image.BICUBIC)


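# Documents arrive as a single file path: images are used as-is, PDFs are
# rasterized page by page with PyMuPDF, and anything else is ignored.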
def load_document_to_images(file_path: str, max_pages: int = MAX_PAGES) -> list[Image.Image]:
    if not file_path:
        return []
    ext = (os.path.splitext(file_path)[1] or "").lower()

    if ext in [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"]:
        im = Image.open(file_path).convert("RGB")
        return [_resize_max(im, MAX_IMAGE_SIZE)]

    if ext == ".pdf":
        doc = fitz.open(file_path)
        imgs = []
        pages = min(len(doc), max_pages)
        for i in range(pages):
            page = doc.load_page(i)
            # Render at 2x (~144 dpi); the 72 dpi default is often too coarse
            # for DocVQA. _resize_max still caps the longest side afterwards.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            im = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            imgs.append(_resize_max(im, MAX_IMAGE_SIZE))
        doc.close()
        return imgs

    return []


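# Donut DocVQA prompting: the question is wrapped in the model's task tokens
# (<s_docvqa><s_question>...</s_question><s_answer>) and the decoder continues
# from <s_answer>; the answer is recovered from the generated sequence.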
def donut_docvqa(image: Image.Image, question: str, max_new_tokens: int = 64) -> str:
    if image is None or not (question or "").strip():
        return ""
    q = question.strip()
    prompt = f"<s_docvqa><s_question>{q}</s_question><s_answer>"
    # Canonical Donut inputs: image -> pixel_values via the processor, task
    # prompt -> decoder_input_ids via the tokenizer (no special tokens added).
    pixel_values = donut_processor(image, return_tensors="pt").pixel_values.to(DEVICE, dtype=DTYPE)
    decoder_input_ids = donut_processor.tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(DEVICE)

    with torch.inference_mode():
        out = donut_model.generate(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_new_tokens=max_new_tokens,
            pad_token_id=donut_processor.tokenizer.pad_token_id,
            eos_token_id=donut_processor.tokenizer.eos_token_id,
            bad_words_ids=[[donut_processor.tokenizer.unk_token_id]],
        )

    # Decoding with skip_special_tokens=True would leave the question text
    # glued onto the answer. Instead strip eos/pad and the task-start token,
    # parse with token2json, and keep only the answer field.
    seq = donut_processor.batch_decode(out)[0]
    seq = seq.replace(donut_processor.tokenizer.eos_token, "").replace(donut_processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()
    parsed = donut_processor.token2json(seq)
    if isinstance(parsed, dict) and "answer" in parsed:
        return re.sub(r"\s+", " ", str(parsed["answer"])).strip()
    return re.sub(r"\s+", " ", re.sub(r"<.*?>", "", seq)).strip()


def t5_summarize(text: str, max_new_tokens: int = 128) -> str:
    t = (text or "").strip()
    if not t:
        return ""
    prompt = f"Summarize this document briefly:\n{t}"
    # Input is truncated at 1024 tokens, so very long extractions get clipped.
    inputs = t5_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
    with torch.inference_mode():
        out = t5_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic
            num_beams=2,      # light beam search
        )
    return t5_tokenizer.decode(out[0], skip_special_tokens=True).strip()


def embed_text(text: str) -> np.ndarray:
    v = embedder.encode([text or ""], normalize_embeddings=True)[0]
    return np.asarray(v, dtype=np.float32)


def cos_sim_matrix(query_vec: np.ndarray, mat: np.ndarray) -> np.ndarray:
    # vectors already normalized -> dot is cosine
    return mat @ query_vec


def format_kv(items: list[tuple[str, str]]) -> str:
    lines = []
    for k, v in items:
        v = (v or "").strip()
        if v:
            lines.append(f"{k}: {v}")
    return "\n".join(lines).strip()


# ---------- App State ----------
# archive_state: list[dict] where each dict contains:
#   { "name": str, "text": str, "vec": np.ndarray }
def ensure_state(archive_state):
    if archive_state is None:
        return []
    return archive_state


# ---------- Actions ----------
DEFAULT_FIELDS = [
    ("amount", "What is the total amount to pay? Return only the amount."),
    ("due_date", "What is the due date? Return only the date."),
    ("period", "What is the billing period?"),
    ("recipient", "Who is the recipient/payee?"),
    ("account", "What is the account / invoice number?"),
]


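# Extraction issues one Donut generate call per field, so its latency scales
# linearly with len(DEFAULT_FIELDS).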
def act_extract(file_obj, archive_state):
    archive_state = ensure_state(archive_state)
    if not file_obj:
        return None, None, "", "", archive_state

    images = load_document_to_images(file_obj, max_pages=MAX_PAGES)
    if not images:
        return None, None, "", "", archive_state

    # Ask Donut each canned question against page 1 only.
    first = images[0]
    answers = []
    for name, q in DEFAULT_FIELDS:
        a = donut_docvqa(first, q)
        answers.append((name, a))

    extracted = format_kv(answers)
    summary = t5_summarize(extracted)
    return first, images, extracted, summary, archive_state


def act_ask(file_obj, question, use_audio_text, audio_text):
    q = (question or "").strip()
    if use_audio_text and (audio_text or "").strip():
        q = (audio_text or "").strip()
    if not file_obj or not q:
        return ""

    images = load_document_to_images(file_obj, max_pages=MAX_PAGES)
    if not images:
        return ""
    return donut_docvqa(images[0], q)


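# The Whisper pipeline resamples raw audio to 16 kHz itself when given a
# {"raw", "sampling_rate"} dict (that resampling path needs torchaudio).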
def act_transcribe(audio_path):
    if not audio_path:
        return ""
    data, sr = sf.read(audio_path)
    if data.ndim > 1:
        data = data.mean(axis=1)  # downmix to mono
    data = data.astype(np.float32)  # soundfile returns float64 by default
    out = asr({"raw": data, "sampling_rate": sr})
    if isinstance(out, dict) and "text" in out:
        return (out["text"] or "").strip()
    return str(out).strip()


def act_add_to_archive(file_obj, extracted, summary, archive_state):
    archive_state = ensure_state(archive_state)
    if not file_obj:
        # Report the current size rather than resetting the counter to "0".
        return archive_state, str(len(archive_state))
    name = os.path.basename(file_obj)

    payload = "\n".join([t for t in [name, extracted or "", summary or ""] if (t or "").strip()]).strip()
    if not payload:
        payload = name

    vec = embed_text(payload)
    archive_state.append({"name": name, "text": payload, "vec": vec})
    return archive_state, str(len(archive_state))


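# The archive lives only in gr.State, i.e. per browser session; a page reload
# or app restart clears it. A persistent store (e.g. SQLite plus a saved
# embedding matrix) would be the natural next step.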
def act_search_archive(query, archive_state):
    archive_state = ensure_state(archive_state)
    q = (query or "").strip()
    if not q or not archive_state:
        return ""

    qv = embed_text(q)
    mat = np.vstack([it["vec"] for it in archive_state]).astype(np.float32)
    sims = cos_sim_matrix(qv, mat)
    idx = np.argsort(-sims)[: min(TOPK, len(archive_state))]

    lines = []
    for rank, i in enumerate(idx, start=1):
        it = archive_state[int(i)]
        s = float(sims[int(i)])
        lines.append(f"{rank}. [{s:.3f}] {it['name']}\n{it['text'][:600]}")
    return "\n\n".join(lines).strip()


# ---------- UI ----------
with gr.Blocks(title="DocuVoice Assistant (MVP)") as demo:
    archive_state = gr.State([])

    with gr.Row():
        file_in = gr.File(label="PDF/Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"])

    with gr.Tabs():
        with gr.Tab("Document"):
            with gr.Row():
                btn_extract = gr.Button("Extract + Summarize", variant="primary")
                btn_add = gr.Button("Add to Archive")
            with gr.Row():
                img_preview = gr.Image(label="Preview (page 1)", type="pil")
                pages_gallery = gr.Gallery(label="Pages", columns=3, height=280, preview=True)
            with gr.Row():
                extracted_out = gr.Textbox(label="Extracted (Donut Q&A)", lines=8)
                summary_out = gr.Textbox(label="Summary (T5)", lines=8)
            with gr.Row():
                question_in = gr.Textbox(label="Question", lines=2, placeholder="Ask about the document...")
            with gr.Row():
                use_audio = gr.Checkbox(label="Use transcribed audio as question", value=False)
            with gr.Row():
                btn_ask = gr.Button("Ask (Donut DocVQA)")
                answer_out = gr.Textbox(label="Answer", lines=6)

            with gr.Row():
                archive_count = gr.Textbox(label="Archive size", value="0", interactive=False)

        with gr.Tab("Voice"):
            audio_in = gr.Audio(label="Audio", sources=["microphone", "upload"], type="filepath")
            btn_asr = gr.Button("Transcribe (Whisper)", variant="primary")
            transcript_out = gr.Textbox(label="Transcript", lines=4)
            btn_asr.click(act_transcribe, inputs=[audio_in], outputs=[transcript_out])

        with gr.Tab("Archive"):
            query_in = gr.Textbox(label="Search query", lines=2, placeholder="e.g., electricity bill October")
            btn_search = gr.Button("Search (Embeddings)")
            results_out = gr.Textbox(label="Results", lines=16)
            btn_search.click(act_search_archive, inputs=[query_in, archive_state], outputs=[results_out])

    btn_extract.click(
        act_extract,
        # pages_gallery is an output here, not an input; act_extract takes
        # only the uploaded file and the archive state.
        inputs=[file_in, archive_state],
        outputs=[img_preview, pages_gallery, extracted_out, summary_out, archive_state],
    )

    btn_ask.click(
        act_ask,
        inputs=[file_in, question_in, use_audio, transcript_out],
        outputs=[answer_out],
    )

    btn_add.click(
        act_add_to_archive,
        inputs=[file_in, extracted_out, summary_out, archive_state],
        outputs=[archive_state, archive_count],
    )

if __name__ == "__main__":
    demo.launch()
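# Assumed dependencies, inferred from the imports above: torch, transformers,
# sentence-transformers, gradio, soundfile, pymupdf, pillow, numpy.
# Run locally with: python app.py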