Spaces:

Toulik
/

MagicFile

Sleeping

App Files Files Community

Toulik committed on Sep 19, 2025

Commit

b83f80e

verified ·

1 Parent(s): 9b904c4

Update app.py

Browse files

Files changed (1) hide show

app.py +216 -63

app.py CHANGED Viewed

@@ -1,70 +1,223 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
     """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-    messages = [{"role": "system", "content": system_message}]
-    messages.extend(history)
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
 if __name__ == "__main__":
     demo.launch()

+# app.py
+import os
+import json
+import tempfile
+import datetime
+import re
+from typing import List, Dict, Any
 import gradio as gr
+from PIL import Image
+import fitz  # PyMuPDF
+import pytesseract
+from pdf2image import convert_from_path
+import openai
+# Read OpenAI key from environment (Hugging Face Spaces secrets)
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to Secrets in the HF Space.")
+openai.api_key = OPENAI_API_KEY
+# Model config
+LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")  # change if you use a different model id
+EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")  # optional
+# ----------------------
+# Text extraction utils
+# ----------------------
+def extract_text_from_pdf(path: str) -> str:
+    """
+    Try text extraction with PyMuPDF; if a page is image-only, fallback to OCR for that page.
+    """
+    doc = fitz.open(path)
+    texts: List[str] = []
+    for i in range(len(doc)):
+        page = doc.load_page(i)
+        txt = page.get_text("text").strip()
+        if txt:
+            texts.append(txt)
+        else:
+            # fallback to render page and OCR
+            pix = page.get_pixmap(dpi=200)
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                pix.save(tmp.name)
+                ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
+                texts.append(ocr_text)
+    return "\n\n".join(texts).strip()
+def extract_text_from_image(path: str) -> str:
+    img = Image.open(path).convert("RGB")
+    return pytesseract.image_to_string(img).strip()
+# ----------------------
+# Simple chunker
+# ----------------------
+def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
+    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+    chunks: List[str] = []
+    current = ""
+    for p in paragraphs:
+        if len(current) + len(p) + 2 <= max_chars:
+            current = (current + "\n\n" + p) if current else p
+        else:
+            if current:
+                chunks.append(current)
+            current = p
+    if current:
+        chunks.append(current)
+    return chunks
+# ----------------------
+# LLM call (strict JSON output requested)
+# ----------------------
+def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -> Dict[str, Any]:
     """
+    Prompts GPT-5 to return a strict JSON object with fields matching the user's schema.
+    The prompt asks the model to output machine-parseable JSON only.
     """
+    # Build prompt
+    prompt = (
+        "You are an automated document taxonomy and tagging assistant for enterprise catalogs.\n\n"
+        f"Document title: {title}\n\n"
+        f"Short document text (first ~1000 chars): {short_text}\n\n"
+        "Top content chunks (short):\n"
+    )
+    for i, c in enumerate(top_chunks[:6]):
+        prompt += f"CHUNK_{i+1}: {c[:800].replace('\\n',' ')}\n\n"
+    prompt += (
+        "Task: Produce a single JSON object (machine parseable) with EXACT keys:\n"
+        "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
+        "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
+        "Guidelines:\n"
+        "- summary: 1-2 sentences summarizing the doc.\n"
+        "- doc_type: short enum-like string (e.g., architecture_comparison, whitepaper, design_doc)\n"
+        "- tags: up to 8 short tags like arch:docai, topic:ocr-parsing\n"
+        "- tag_confidences: map with floats 0-1 for each tag\n"
+        "- taxonomy_path: hierarchical list, e.g. [\"Technology\",\"Document Processing\",\"OCR & Parsing\"]\n"
+        "- extracted_entities: map with keys like platforms, tools (each is an array)\n"
+        "- raw_url: if not available, return an empty string\n"
+        "- ingest_timestamp: ISO8601 with timezone (e.g., 2025-09-19T09:13:00+05:30)\n\n"
+        "OUTPUT: ONLY THE JSON OBJECT. DO NOT PROVIDE ANY ADDITIONAL TEXT.\n"
+    )
+    response = openai.ChatCompletion.create(
+        model=LLM_MODEL,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.0,
+        max_tokens=1000,
+    )
+    text = response["choices"][0]["message"]["content"].strip()
+    # Try to extract JSON object from the reply
+    m = re.search(r"\{[\s\S]*\}$", text)
+    json_text = m.group(0) if m else text
+    try:
+        data = json.loads(json_text)
+    except Exception:
+        # If parse fails, return an error structure so UI can show the raw output
+        data = {"_parsing_error": True, "raw_output": text}
+    return data
+# ----------------------
+# Main processing function
+# ----------------------
+def process_file(file_obj) -> Dict[str, Any]:
+    """
+    file_obj: the uploaded file object provided by Gradio; has .name and a .file-like interface
+    Returns metadata dict ready to display.
+    """
+    # Save uploaded file to temporary path
+    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_obj.name)[1]) as tmp:
+        tmp.write(file_obj.read())
+        tmp_path = tmp.name
+    # Extract text
+    try:
+        if file_obj.name.lower().endswith(".pdf"):
+            extracted_text = extract_text_from_pdf(tmp_path)
+        else:
+            extracted_text = extract_text_from_image(tmp_path)
+    except Exception as e:
+        return {"error": f"Text extraction failed: {e}"}
+    if not extracted_text:
+        return {"error": "No text found in document after extraction."}
+    # Chunk and pick top chunks
+    chunks = chunk_text(extracted_text)
+    # Heuristic: pick longest chunks as representative
+    sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
+    top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
+    # Prepare a "short_text" to feed to the LLM
+    short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
+    # Call LLM
+    metadata = call_gpt5_for_metadata(file_obj.name, short_text, top_chunks)
+    # If LLM returned a parsing error, include it
+    if metadata.get("_parsing_error"):
+        return {
+            "error": "LLM output parsing failed. See raw_output.",
+            "raw_output": metadata.get("raw_output")
+        }
+    # Ensure required keys exist and post-process small things
+    now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
+    metadata.setdefault("doc_id", os.path.splitext(file_obj.name)[0])
+    metadata.setdefault("title", file_obj.name)
+    metadata.setdefault("source", "user_upload")
+    metadata.setdefault("raw_url", "")
+    metadata.setdefault("ingest_timestamp", now)
+    return metadata
+# ----------------------
+# Gradio UI
+# ----------------------
+with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
+    gr.Markdown("## 📂 Upload a PDF or Image — the app will classify, tag, and propose a taxonomy using GPT-5")
+    with gr.Row():
+        with gr.Column(scale=1):
+            uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
+            run_button = gr.Button("Process document")
+            status = gr.Textbox(label="Status", value="", interactive=False)
+            download_button = gr.File(label="Download metadata JSON", visible=False)
+        with gr.Column(scale=1):
+            output_json = gr.JSON(label="Document metadata (JSON)")
+    def on_process(file_obj):
+        status.value = "Processing..."
+        try:
+            result = process_file(file_obj)
+        except Exception as e:
+            status.value = f"Failed: {e}"
+            return gr.update(value={}), gr.update(value="Failed: " + str(e)), None
+        if result.get("error"):
+            status.value = f"Error: {result.get('error')}"
+            # if raw_output provided, show under JSON
+            return gr.update(value={"error": result.get("error"), "raw_output": result.get("raw_output", "")}), gr.update(value=status.value), None
+        status.value = "Done"
+        # create a temp json file for download
+        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+        with open(tmpf.name, "w", encoding="utf8") as f:
+            json.dump(result, f, indent=2, ensure_ascii=False)
+        # gr.File expects a path - return tuple (label, path) or file object depending on gradio version
+        return gr.update(value=result), gr.update(value="Done"), tmpf.name
+    run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button])
 if __name__ == "__main__":
     demo.launch()