Spaces:

Omarrran
/

Analyse_Proposal

Sleeping

App Files Files Community

Omarrran committed on Apr 8, 2025

Commit

c72b167

verified ·

1 Parent(s): 5c06b65

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -70

app.py CHANGED Viewed

@@ -22,22 +22,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger("pdf_processor")
-# Attempt to import Unstructured.io partitioning
 # Optional dependency probe: UNSTRUCTURED_AVAILABLE records whether the
 # Unstructured.io PDF partitioner could be imported; process_pdf only
 # registers that extractor when the flag is True.
 try:
     from unstructured.partition.pdf import partition_pdf
     UNSTRUCTURED_AVAILABLE = True
 except ImportError:
     UNSTRUCTURED_AVAILABLE = False
-    logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
-# Load API key from environment (set this in your Space's Secrets as GOOGLE_API_KEY)
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
     genai.configure(api_key=API_KEY)
 else:
     # A missing key is only logged here; ask_question checks API_KEY again
     # before it tries to answer.
     logger.warning("GOOGLE_API_KEY not set in environment.")
-# Globals to store state
 # All three are rebound by process_pdf via `global`; ask_question reads
 # EXTRACTED_TEXT.
 EXTRACTED_TEXT = ""
 PDF_SECTIONS = []
 EXTRACTION_METHOD = ""
@@ -45,21 +44,25 @@ EXTRACTION_METHOD = ""
 # --- Extraction Functions ---
 # Partition the PDF at pdf_path with partition_pdf and group the element
 # text into a list of {"title": ..., "content": ...} section dicts.
 # Text seen before the first detected header is filed under "Introduction".
 # Any exception from partition_pdf propagates to the caller unchanged.
 def extract_text_with_unstructured(pdf_path):
-    logger.info("Extracting via Unstructured.io...")
-    elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
-    sections, current = [], {"title": "Introduction", "content": ""}
-    for e in elements:
     # Elements without a .text attribute, or whose text is empty after
     # stripping, are ignored.
-        if hasattr(e, "text") and (t := e.text.strip()):
-            # Section header heuristic
         # A header is a string under 80 characters that is all upper-case,
         # ends with ":", or starts with digits, an optional ".", and whitespace.
-            if len(t) < 80 and (t.isupper() or t.endswith(":") or re.match(r"^[0-9]+\.?\s+", t)):
             # Only sections that collected content are kept; a header that is
             # immediately followed by another header is dropped.
-                if current["content"]:
-                    sections.append(current)
-                current = {"title": t, "content": ""}
-            else:
-                current["content"] += t + "\n\n"
     # Flush the trailing section, again only if it has content.
-    if current["content"]:
-        sections.append(current)
-    return sections
 def extract_text_with_pypdf(pdf_path):
@@ -76,7 +79,6 @@ def extract_text_with_pypdf(pdf_path):
             {"title": parts[i].strip(), "content": parts[i + 1].strip()}
             for i in range(1, len(parts), 2)
         ]
-    # fallback single section
     return [{"title": "Document", "content": full_text}]
@@ -100,7 +102,7 @@ def extract_text_with_tika(pdf_path):
     return sections
-# --- Gemini API calls ---
 def generate_greg_brockman_summary(content):
     model = genai.GenerativeModel("gemini-1.5-pro")
     prompt = f"""
@@ -110,14 +112,14 @@ You are an expert document analyst specializing in proposal evaluation.
 1. GOAL: ...
 ... (rest of template) ...
-CONTENT TO ANALYZE:
 {content}
 """
     try:
         resp = model.generate_content(prompt)
         return resp.text, None
     except Exception as e:
-        logger.error(f"Summary generation error: {e}")
         return None, str(e)
@@ -135,11 +137,11 @@ QUESTION: {question}
         resp = model.generate_content(prompt)
         return resp.text, None
     except Exception as e:
-        logger.error(f"Q&A generation error: {e}")
         return None, str(e)
-# --- Processing & Q&A Handlers ---
 def process_pdf(pdf_file, progress=gr.Progress()):
     global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
@@ -148,13 +150,25 @@ def process_pdf(pdf_file, progress=gr.Progress()):
     if pdf_file is None:
         return None, None, "❌ No file uploaded.", ""
-    # Save to temp
     tmp_dir = tempfile.gettempdir()
-    path = os.path.join(tmp_dir, pdf_file.name)
-    with open(path, "wb") as f:
-        f.write(pdf_file.read())
-    # Choose methods
     methods = []
     if UNSTRUCTURED_AVAILABLE:
         methods.append(("unstructured", extract_text_with_unstructured))
@@ -164,6 +178,7 @@ def process_pdf(pdf_file, progress=gr.Progress()):
     ]
     sections = None
     for name, fn in methods:
         try:
             secs = fn(path)
@@ -172,45 +187,37 @@ def process_pdf(pdf_file, progress=gr.Progress()):
                 EXTRACTION_METHOD = name
                 break
         except Exception as e:
-            logger.warning(f"{name} failed: {e}")
     if not sections:
-        return None, None, "❌ Extraction failed.", ""
-    # Combine & store
-    combined = ""
-    structure = ""
-    for idx, sec in enumerate(sections, start=1):
-        structure += f"{idx}. {sec['title']}\n"
         chunk = f"## {sec['title']}\n{sec['content']}\n\n"
-        if len(combined) + len(chunk) < 30000:
-            combined += chunk
-        else:
-            combined += f"## {sec['title']}\n[Truncated]\n\n"
-            structure += "   [Content truncated]\n"
     EXTRACTED_TEXT = combined
     PDF_SECTIONS = sections
-    # Generate summary
     summary, err = generate_greg_brockman_summary(combined)
     if err:
         return None, structure, f"❌ {err}", combined
-    return summary, structure, "✅ PDF processed successfully", f"Used {EXTRACTION_METHOD}."
 # Answer a user question against the text stored in EXTRACTED_TEXT.
 # Always returns a string: the answer, or a "❌ ..." message when a
 # precondition fails or the Q&A helper reports an error.
 def ask_question(question):
     # Guards run in order: API key present, a PDF already processed,
     # non-blank question.
     if not API_KEY:
         return "❌ Set GOOGLE_API_KEY in Secrets."
     if not EXTRACTED_TEXT:
-        return "❌ Please upload & process a PDF first."
     # NOTE(review): question.strip() raises AttributeError if question is
     # None -- confirm the UI always passes a string.
     if not question.strip():
         return "❌ Enter a question."
     # The helper's result is unpacked as an (answer, error) pair.
-    answer, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
-    if err:
-        return f"❌ {err}"
-    return answer
 def view_log():
     try:
@@ -218,7 +225,6 @@ def view_log():
     except Exception as e:
         return f"Error reading log: {e}"
 def save_summary(summary):
     if not summary:
         return "❌ No summary to save."
@@ -227,7 +233,6 @@ def save_summary(summary):
         f.write(summary)
     return f"✅ Saved to {fn}"
 def save_qa(question, answer):
     if not question or not answer:
         return "❌ Nothing to save."
@@ -243,28 +248,16 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
     gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
     with gr.Tab("Setup"):
-        with gr.Row():
-            api_key_input = gr.Textbox(
-                label="Google Gemini API Key",
-                type="password",
-                placeholder="Set in Secrets (GOOGLE_API_KEY)"
-            )
-            api_button = gr.Button("Configure API")
-        api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
-        api_button.click(
-            fn=lambda key: (genai.configure(api_key=key) or "✅ API configured", None),
-            inputs=[api_key_input],
-            outputs=[api_status, gr.State()]
-        )
     with gr.Tab("PDF Processing"):
         with gr.Row():
             pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
             proc_btn = gr.Button("Process PDF", variant="primary")
-            status = gr.Markdown("Awaiting upload...")
         summary_out = gr.Textbox(label="Summary", lines=15)
         structure_out = gr.Textbox(label="Structure", lines=8)
-        log_info = gr.Textbox(label="Internal Log", lines=5)
         proc_btn.click(
             fn=process_pdf,
             inputs=[pdf_file],
@@ -289,5 +282,4 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
         refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
 if __name__ == "__main__":
-    # On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0" ensures external access
     app.launch(server_name="0.0.0.0")

 )
 logger = logging.getLogger("pdf_processor")
+# Try Unstructured.io
 # Optional dependency probe: UNSTRUCTURED_AVAILABLE records whether the
 # Unstructured.io PDF partitioner could be imported; process_pdf only
 # registers that extractor when the flag is True.
 try:
     from unstructured.partition.pdf import partition_pdf
     UNSTRUCTURED_AVAILABLE = True
 except ImportError:
     UNSTRUCTURED_AVAILABLE = False
+    logger.warning("unstructured.partition.pdf not available; skipping that method")
+# Load Gemini API key from env (set in your Space Secrets)
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
     genai.configure(api_key=API_KEY)
 else:
     # A missing key is only logged here; ask_question checks API_KEY again
     # before it tries to answer.
     logger.warning("GOOGLE_API_KEY not set in environment.")
 # Module-level state: all three are rebound by process_pdf via `global`;
 # ask_question reads EXTRACTED_TEXT.
 EXTRACTED_TEXT = ""
 PDF_SECTIONS = []
 EXTRACTION_METHOD = ""
 # --- Extraction Functions ---
 # Partition the PDF at pdf_path with partition_pdf and group the element
 # text into a list of {"title": ..., "content": ...} section dicts.
 # Text seen before the first detected header is filed under "Introduction".
 # Any exception is logged with its traceback and then re-raised.
 def extract_text_with_unstructured(pdf_path):
+    try:
+        logger.info("Extracting via Unstructured.io...")
+        elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
+        sections, current = [], {"title": "Introduction", "content": ""}
+        for e in elements:
             # Elements without a .text attribute, or whose text is empty
             # after stripping, are ignored.
+            if hasattr(e, "text") and (t := e.text.strip()):
                 # Header heuristic: under 80 characters and either all
                 # upper-case, ending with ":", or starting with digits, an
                 # optional ".", and whitespace.
+                if len(t) < 80 and (t.isupper() or t.endswith(":") or re.match(r"^[0-9]+\.?\s+", t)):
                     # Only sections that collected content are kept; a header
                     # immediately followed by another header is dropped.
+                    if current["content"]:
+                        sections.append(current)
+                    current = {"title": t, "content": ""}
+                else:
+                    current["content"] += t + "\n\n"
         # Flush the trailing section, again only if it has content.
+        if current["content"]:
+            sections.append(current)
+        return sections
+    except Exception as e:
+        # Bubble up so process_pdf can catch & log
+        logger.error(f"Unstructured extraction error: {e}", exc_info=True)
+        raise
 def extract_text_with_pypdf(pdf_path):
             {"title": parts[i].strip(), "content": parts[i + 1].strip()}
             for i in range(1, len(parts), 2)
         ]
     return [{"title": "Document", "content": full_text}]
     return sections
+# --- Gemini calls ---
 def generate_greg_brockman_summary(content):
     model = genai.GenerativeModel("gemini-1.5-pro")
     prompt = f"""
 1. GOAL: ...
 ... (rest of template) ...
+CONTENT:
 {content}
 """
     try:
         resp = model.generate_content(prompt)
         return resp.text, None
     except Exception as e:
+        logger.error(f"Summary error: {e}")
         return None, str(e)
         resp = model.generate_content(prompt)
         return resp.text, None
     except Exception as e:
+        logger.error(f"Q&A error: {e}")
         return None, str(e)
+# --- Handlers ---
 def process_pdf(pdf_file, progress=gr.Progress()):
     global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
     if pdf_file is None:
         return None, None, "❌ No file uploaded.", ""
+    # Determine path & write bytes if needed
     tmp_dir = tempfile.gettempdir()
+    # Case 1: NamedString (in‐memory) with .name & .data
+    if hasattr(pdf_file, "name") and hasattr(pdf_file, "data"):
+        path = os.path.join(tmp_dir, pdf_file.name)
+        with open(path, "wb") as f:
+            f.write(pdf_file.data)
+    # Case 2: direct filepath (str)
+    elif isinstance(pdf_file, str):
+        path = pdf_file
+    # Case 3: file‐like with .read()
+    elif hasattr(pdf_file, "read"):
+        path = os.path.join(tmp_dir, getattr(pdf_file, "name", "uploaded.pdf"))
+        with open(path, "wb") as f:
+            f.write(pdf_file.read())
+    else:
+        return None, None, "❌ Unrecognized upload type", ""
+    # Try methods in order
     methods = []
     if UNSTRUCTURED_AVAILABLE:
         methods.append(("unstructured", extract_text_with_unstructured))
     ]
     sections = None
+    last_err = ""
     for name, fn in methods:
         try:
             secs = fn(path)
                 EXTRACTION_METHOD = name
                 break
         except Exception as e:
+            last_err = f"{name} failed: {e}"
+            logger.warning(last_err)
     if not sections:
+        return None, None, "❌ Extraction failed", last_err
+    # Combine & summarize
+    combined, structure = "", ""
+    for i, sec in enumerate(sections, 1):
+        structure += f"{i}. {sec['title']}\n"
         chunk = f"## {sec['title']}\n{sec['content']}\n\n"
+        combined += chunk if len(combined + chunk) < 30000 else f"## {sec['title']}\n[Truncated]\n\n"
     EXTRACTED_TEXT = combined
     PDF_SECTIONS = sections
     summary, err = generate_greg_brockman_summary(combined)
     if err:
         return None, structure, f"❌ {err}", combined
+    return summary, structure, "✅ PDF processed", f"Used {EXTRACTION_METHOD}"
 # Answer a user question against the text stored in EXTRACTED_TEXT.
 # Always returns a string: the answer, or a "❌ ..." message when a
 # precondition fails or the Q&A helper reports an error.
 def ask_question(question):
     # Guards run in order: API key present, a PDF already processed,
     # non-blank question.
     if not API_KEY:
         return "❌ Set GOOGLE_API_KEY in Secrets."
     if not EXTRACTED_TEXT:
+        return "❌ Process a PDF first."
     # NOTE(review): question.strip() raises AttributeError if question is
     # None -- confirm the UI always passes a string.
     if not question.strip():
         return "❌ Enter a question."
     # The helper's result is unpacked as an (answer, error) pair; a truthy
     # error takes precedence over the answer.
+    ans, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
+    return ans if not err else f"❌ {err}"
 def view_log():
     try:
     except Exception as e:
         return f"Error reading log: {e}"
 def save_summary(summary):
     if not summary:
         return "❌ No summary to save."
         f.write(summary)
     return f"✅ Saved to {fn}"
 def save_qa(question, answer):
     if not question or not answer:
         return "❌ Nothing to save."
     gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
     with gr.Tab("Setup"):
+        gr.Markdown("⚠️ Make sure `GOOGLE_API_KEY` is set in your Space's Secrets.")
     with gr.Tab("PDF Processing"):
         with gr.Row():
             pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
             proc_btn = gr.Button("Process PDF", variant="primary")
+            status = gr.Markdown("Awaiting upload…")
         summary_out = gr.Textbox(label="Summary", lines=15)
         structure_out = gr.Textbox(label="Structure", lines=8)
+        log_info    = gr.Textbox(label="Internal Log", lines=5)
         proc_btn.click(
             fn=process_pdf,
             inputs=[pdf_file],
         refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
 # Script entry point: launch the Gradio app only when the file is run
 # directly. server_name="0.0.0.0" binds to all interfaces so the app is
 # reachable from outside the container.
 if __name__ == "__main__":
     app.launch(server_name="0.0.0.0")