Spaces:

Toulik
/

MagicFile

Sleeping

App Files Community

Toulik committed on Sep 19, 2025

Commit

7e8c7d5

verified ·

1 Parent(s): 95e19a1

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -62

app.py CHANGED Viewed

@@ -10,18 +10,23 @@ import gradio as gr
 from PIL import Image
 import fitz  # PyMuPDF
 import pytesseract
-from pdf2image import convert_from_path
-import openai
-# Read OpenAI key from environment (Hugging Face Spaces secrets)
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to Secrets in the HF Space.")
-openai.api_key = OPENAI_API_KEY
-# Model config
-LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")  # change if you use a different model id
 EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")  # optional
 # ----------------------
@@ -29,9 +34,13 @@ EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
 # ----------------------
 def extract_text_from_pdf(path: str) -> str:
     """
-    Try text extraction with PyMuPDF; if a page is image-only, fallback to OCR for that page.
     """
-    doc = fitz.open(path)
     texts: List[str] = []
     for i in range(len(doc)):
         page = doc.load_page(i)
@@ -54,7 +63,7 @@ def extract_text_from_image(path: str) -> str:
 # ----------------------
-# Simple chunker
 # ----------------------
 def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
     paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
@@ -73,24 +82,26 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
 # ----------------------
-# LLM call (strict JSON output requested)
 # ----------------------
 def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -> Dict[str, Any]:
     """
-    Prompts GPT-5 to return a strict JSON object with fields matching the user's schema.
-    The prompt asks the model to output machine-parseable JSON only.
     """
-    # Build prompt
-    prompt = (
         "You are an automated document taxonomy and tagging assistant for enterprise catalogs.\n\n"
         f"Document title: {title}\n\n"
         f"Short document text (first ~1000 chars): {short_text}\n\n"
         "Top content chunks (short):\n"
     )
     for i, c in enumerate(top_chunks[:6]):
-        prompt += "CHUNK_{}: {}\n\n".format(i+1, c[:800].replace("\n", " "))
-    prompt += (
         "Task: Produce a single JSON object (machine parseable) with EXACT keys:\n"
         "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
         "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
@@ -106,44 +117,71 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -
         "OUTPUT: ONLY THE JSON OBJECT. DO NOT PROVIDE ANY ADDITIONAL TEXT.\n"
     )
-    response = openai.ChatCompletion.create(
-        model=LLM_MODEL,
-        messages=[{"role": "user", "content": prompt}],
-        temperature=0.0,
-        max_tokens=1000,
-    )
-    text = response["choices"][0]["message"]["content"].strip()
-    # Try to extract JSON object from the reply
     m = re.search(r"\{[\s\S]*\}$", text)
     json_text = m.group(0) if m else text
     try:
         data = json.loads(json_text)
     except Exception:
-        # If parse fails, return an error structure so UI can show the raw output
         data = {"_parsing_error": True, "raw_output": text}
     return data
-# helper: accept multiple upload types and return saved temp path and original name
 def save_uploaded_to_tmp(file_obj):
     """
-    Accepts:
-      - a file-like object with .read()
-      - a path string (existing file path)
-      - a dict-like object returned by some gradio versions: {"name": "...", "data": b'...'}
-      - a NamedTemporaryFile wrapper (sometimes behaves like a path string)
     Returns (tmp_path, original_name)
     """
-    import io
     # Case 1: file-like object with .read()
     if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
         try:
             content = file_obj.read()
-            # some wrappers return str, ensure bytes
             if isinstance(content, str):
                 content = content.encode("utf-8")
             name = getattr(file_obj, "name", "uploaded_file")
@@ -152,12 +190,10 @@ def save_uploaded_to_tmp(file_obj):
                 tmp.write(content)
                 return tmp.name, os.path.basename(name)
         except Exception:
-            # fallthrough to other handlers
             pass
-    # Case 2: Gradio sometimes returns a dict-like object with 'name' and 'data'
     if isinstance(file_obj, dict):
-        # some versions: {"name": "foo.pdf", "data": b'...'}
         if "data" in file_obj and "name" in file_obj:
             data = file_obj["data"]
             if isinstance(data, str):
@@ -170,10 +206,8 @@ def save_uploaded_to_tmp(file_obj):
     # Case 3: file_obj is a path string
     if isinstance(file_obj, str):
-        # if it's an existing path, just return it
         if os.path.exists(file_obj):
             return file_obj, os.path.basename(file_obj)
-        # sometimes gradio passes a NamedString that can be opened as a path -- try to open it
         try:
             with open(file_obj, "rb") as f:
                 data = f.read()
@@ -184,7 +218,7 @@ def save_uploaded_to_tmp(file_obj):
         except Exception:
             pass
-    # Case 4: some wrappers expose .name but not .read (e.g., NamedString)
     name = getattr(file_obj, "name", None)
     if name and isinstance(name, str):
         try:
@@ -197,25 +231,20 @@ def save_uploaded_to_tmp(file_obj):
         except Exception:
             pass
-    # If we reach here, we can't handle the object
-    raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. Value: {str(file_obj)[:200]}")
-# ----------------------
-# Main processing function
-# ----------------------
-# Updated process_file using the helper above
 def process_file(file_obj) -> Dict[str, Any]:
     """
-    file_obj: whatever gradio handed to us (file-like, dict, path string, etc.)
-    Returns metadata dict ready to display.
     """
     try:
         tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
     except Exception as e:
         return {"error": f"Failed to save uploaded file: {e}"}
-    # Now use tmp_path and orig_name for the rest of the pipeline
     try:
         if orig_name.lower().endswith(".pdf"):
             extracted_text = extract_text_from_pdf(tmp_path)
@@ -234,14 +263,16 @@ def process_file(file_obj) -> Dict[str, Any]:
     short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
     metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks)
     if metadata.get("_parsing_error"):
-        return {
-            "error": "LLM output parsing failed. See raw_output.",
-            "raw_output": metadata.get("raw_output")
-        }
     now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
     metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
     metadata.setdefault("title", orig_name)
@@ -271,23 +302,20 @@ with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
         try:
             result = process_file(file_obj)
         except Exception as e:
-            status.value = f"Failed: {e}"
-            return gr.update(value={}), gr.update(value="Failed: " + str(e)), None
         if result.get("error"):
-            status.value = f"Error: {result.get('error')}"
-            # if raw_output provided, show under JSON
-            return gr.update(value={"error": result.get("error"), "raw_output": result.get("raw_output", "")}), gr.update(value=status.value), None
-        status.value = "Done"
         # create a temp json file for download
         tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
         with open(tmpf.name, "w", encoding="utf8") as f:
             json.dump(result, f, indent=2, ensure_ascii=False)
-        # gr.File expects a path - return tuple (label, path) or file object depending on gradio version
         return gr.update(value=result), gr.update(value="Done"), tmpf.name
     run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button])
 if __name__ == "__main__":
     demo.launch()

 from PIL import Image
 import fitz  # PyMuPDF
 import pytesseract
+# pdf2image is optional here; PyMuPDF is used for the PDF -> image rendering fallback
+# from pdf2image import convert_from_path
+# OpenAI new client
+from openai import OpenAI
+# -----------------------
+# Configuration / Client
+# -----------------------
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
+    raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to Secrets in HF Space or set env var.")
+# Create the new OpenAI client (new API surface for openai>=1.0.0)
+client = OpenAI(api_key=OPENAI_API_KEY)
+LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")  # change to your available model id if needed
 EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")  # optional
 # ----------------------
 # ----------------------
 def extract_text_from_pdf(path: str) -> str:
     """
+    Extract text using PyMuPDF. If a page has no extractable text, render to image and OCR with pytesseract.
     """
+    try:
+        doc = fitz.open(path)
+    except Exception as e:
+        raise RuntimeError(f"Failed to open PDF: {e}")
     texts: List[str] = []
     for i in range(len(doc)):
         page = doc.load_page(i)
 # ----------------------
+# Chunker
 # ----------------------
 def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
     paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
 # ----------------------
+# OpenAI LLM & embeddings helpers (new client surface)
 # ----------------------
 def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str]) -> Dict[str, Any]:
     """
+    Prompt GPT-5 to return a single JSON object matching the schema the user specified.
+    We ask the model to return JSON only. We do a best-effort parse and return structured dict.
     """
+    prompt_intro = (
         "You are an automated document taxonomy and tagging assistant for enterprise catalogs.\n\n"
         f"Document title: {title}\n\n"
         f"Short document text (first ~1000 chars): {short_text}\n\n"
         "Top content chunks (short):\n"
     )
+    prompt_chunks = ""
     for i, c in enumerate(top_chunks[:6]):
+        chunk_text_clean = c[:800].replace("\n", " ")
+        prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
+    prompt_end = (
         "Task: Produce a single JSON object (machine parseable) with EXACT keys:\n"
         "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
         "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
         "OUTPUT: ONLY THE JSON OBJECT. DO NOT PROVIDE ANY ADDITIONAL TEXT.\n"
     )
+    prompt = prompt_intro + prompt_chunks + prompt_end
+    # Call using new client
+    try:
+        resp = client.chat.completions.create(
+            model=LLM_MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0,
+            max_tokens=1000,
+        )
+    except Exception as e:
+        return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}
+    # Extract text robustly
+    try:
+        text = resp.choices[0].message["content"].strip()
+    except Exception:
+        # fallback attribute access if response uses attribute objects
+        try:
+            text = resp.choices[0].message.content.strip()
+        except Exception:
+            text = str(resp)
+    # Try to extract JSON block
     m = re.search(r"\{[\s\S]*\}$", text)
     json_text = m.group(0) if m else text
     try:
         data = json.loads(json_text)
     except Exception:
         data = {"_parsing_error": True, "raw_output": text}
     return data
+def get_embeddings_for_chunks(chunks: List[str], model: str = EMBEDDING_MODEL) -> List[List[float]]:  # embeds all chunks with one client.embeddings.create call; returns one vector per resp.data item
+    try:
+        resp = client.embeddings.create(model=model, input=chunks)  # the whole list is sent as a single `input` (no batching here)
+    except Exception as e:
+        raise RuntimeError(f"Embeddings API call failed: {e}")  # any client error is re-raised as RuntimeError carrying the original error text
+    # Primary path: each resp.data item is read via attribute access (.embedding).
+    try:
+        return [item.embedding for item in resp.data]  # NOTE(review): result order follows resp.data; presumably matches `chunks` order - confirm against the API docs
+    except Exception:
+        # Fallback: if attribute access raised, read the same field with dict-style item["embedding"] instead.
+        return [item["embedding"] for item in resp.data]
+# ----------------------
+# Robust uploader helper + processing
+# ----------------------
 def save_uploaded_to_tmp(file_obj):
     """
+    Accepts multiple upload types commonly returned by gradio:
+      - file-like object with .read()
+      - dict-like {"name": "...", "data": b'...'}
+      - path string (existing file path)
+      - objects with a .name attribute pointing to a saved path (NamedString)
     Returns (tmp_path, original_name)
     """
     # Case 1: file-like object with .read()
     if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
         try:
             content = file_obj.read()
+            # sometimes content may be str
             if isinstance(content, str):
                 content = content.encode("utf-8")
             name = getattr(file_obj, "name", "uploaded_file")
                 tmp.write(content)
                 return tmp.name, os.path.basename(name)
         except Exception:
             pass
+    # Case 2: dict-like returned by some gradio versions
     if isinstance(file_obj, dict):
         if "data" in file_obj and "name" in file_obj:
             data = file_obj["data"]
             if isinstance(data, str):
     # Case 3: file_obj is a path string
     if isinstance(file_obj, str):
         if os.path.exists(file_obj):
             return file_obj, os.path.basename(file_obj)
         try:
             with open(file_obj, "rb") as f:
                 data = f.read()
         except Exception:
             pass
+    # Case 4: object has .name attribute referencing a real path (NamedString)
     name = getattr(file_obj, "name", None)
     if name and isinstance(name, str):
         try:
         except Exception:
             pass
+    raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. Value repr: {repr(file_obj)[:400]}")
 def process_file(file_obj) -> Dict[str, Any]:
     """
+    Orchestrates saving uploaded file, extracting text, chunking, calling LLM and post-processing.
+    Returns: metadata dict or {"error": "..."} on failure.
     """
     try:
         tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
     except Exception as e:
         return {"error": f"Failed to save uploaded file: {e}"}
+    # Extract text
     try:
         if orig_name.lower().endswith(".pdf"):
             extracted_text = extract_text_from_pdf(tmp_path)
     short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
+    # Call LLM to get JSON metadata
     metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks)
+    if metadata.get("_api_error"):
+        return {"error": metadata.get("error")}
     if metadata.get("_parsing_error"):
+        return {"error": "LLM output parsing failed. See raw_output.", "raw_output": metadata.get("raw_output")}
+    # Ensure required keys and add ingestion timestamp if missing
     now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
     metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
     metadata.setdefault("title", orig_name)
         try:
             result = process_file(file_obj)
         except Exception as e:
+            return gr.update(value={}), gr.update(value=f"Failed: {e}"), None
         if result.get("error"):
+            return gr.update(value={"error": result.get("error"), "raw_output": result.get("raw_output", "")}), gr.update(value=f"Error: {result.get('error')}"), None
         # create a temp json file for download
         tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
         with open(tmpf.name, "w", encoding="utf8") as f:
             json.dump(result, f, indent=2, ensure_ascii=False)
         return gr.update(value=result), gr.update(value="Done"), tmpf.name
     run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button])
+# Launch
 if __name__ == "__main__":
     demo.launch()