Spaces:

sejalkishan
/

doc-sum

Build error

App Files Files Community

sejalkishan committed on Jul 12, 2025

Commit

da8f4ee

verified ·

1 Parent(s): 61b6f1c

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -60

app.py CHANGED Viewed

@@ -8,19 +8,20 @@ import pytesseract
 import torch
 import os
 import spaces
-# 🔐 Authenticate Hugging Face token
 login(token=os.environ.get("token"))
-# ✅ Ensure GPU is available
 if not torch.cuda.is_available():
     raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
 print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
-# 🧠 Model
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
-# 📄 Document extractors
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
@@ -51,76 +52,82 @@ def chunk_text(text, max_chars=6000):
         chunks.append(current_chunk)
     return chunks
-# 🧾 Q&A Prompt Template
 def create_prompt(text_chunk):
     return f"""
 You are an expert in analyzing U.S. government tender documents. Based only on the content provided below, answer the following 20 standard questions in Q&A format. If something is not mentioned, write "Not mentioned in the provided document."
 CONTENT:
 {text_chunk}
 Now provide answers for:
-Q1: What is the general scope of the tender?
-Q2: Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?
-Q3: Is there a Set-aside status (e.g., 8a, SDVOSB)?
-Q4: Are U.S. citizens or security-cleared staff required?
-Q5: What is the expected team size or key qualifications?
-Q6: Are offshore resources allowed?
-Q7: What is the mode of working (On-site/Remote/Hybrid)?
-Q8: Is presence in specific regions/states required?
-Q9: Is the delivery location defined?
-Q10: Is remote or offshore delivery allowed?
-Q11: Is a U.S. office presence required?
-Q12: Are travel/lodging expenses reimbursable?
-Q13: Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?
-Q14: Are background checks or security clearance needed?
-Q15: Is past experience required?
-Q16: How many references are required?
-Q17: Are only U.S. references accepted?
-Q18: Is private sector experience allowed?
-Q19: Do references need to be identified?
-Q20: Is subcontracting permitted?
 ...
 """
-# 🧼 Cleaner
 def clean_output(raw_output):
-    # Find first valid Q1
-    start_idx = raw_output.find("Q1:")
-    if start_idx == -1:
-        return raw_output.strip()
-    cleaned = raw_output[start_idx:]
-    # Remove everything after second instance of Q1 (to drop repeated prompts)
-    second_q1 = cleaned.find("Q1:", 3)  # skip first one
-    if second_q1 != -1:
-        cleaned = cleaned[:second_q1]
-    # Drop leftover instructions if they show up later
-    cut_phrases = ["You are an expert", "Now provide answers", "CONTENT:", "Answer clearly and in the same format:"]
-    for phrase in cut_phrases:
-        if phrase in cleaned:
-            cleaned = cleaned.split(phrase)[0]
-    return cleaned.strip()
-# 🚀 Main analysis function
 @spaces.GPU(duration=150)
-def analyze_document(file, cancel_flag):
-    ext = os.path.splitext(file.name)[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
-        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format"
     if len(raw_text.strip()) == 0:
-        return "❌ No text found in the document.", "❌ Empty document"
     chunks = chunk_text(raw_text)
     full_summary = ""
@@ -136,16 +143,17 @@ def analyze_document(file, cancel_flag):
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     for i, chunk in enumerate(chunks):
-        if cancel_flag:
-            return "⛔ Analysis cancelled by user.", "⛔ Terminated by user"
-        status_msg = f"🔄 Processing chunk {i+1} of {len(chunks)}..."
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
-        cleaned = clean_output(result)
-        full_summary += cleaned + "\n\n---\n\n"
-    return full_summary.strip(), "✅ Completed"
 # 🌐 Gradio Interface
 with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
@@ -157,17 +165,17 @@ with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
             with gr.Row():
                 analyze_button = gr.Button("🔍 Analyze", variant="primary")
                 terminate_button = gr.Button("❌ Terminate", variant="stop")
-            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)
         with gr.Column(scale=2):
             output_box = gr.Textbox(label="🧠 Extracted Tender Intelligence", lines=30, interactive=False)
-    cancel_flag = gr.State(False)
     analyze_button.click(
         fn=analyze_document,
-        inputs=[file_input, cancel_flag],
-        outputs=[output_box, status_box]
     )
     terminate_button.click(

 import torch
 import os
 import spaces
+import re
+# 🔐 Hugging Face authentication
 login(token=os.environ.get("token"))
+# ✅ Check GPU availability
 if not torch.cuda.is_available():
     raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
 print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
+# 🧠 Model ID
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+# 📄 Extractor for PDF (with OCR) and DOCX
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         chunks.append(current_chunk)
     return chunks
+# 🧾 Prompt optimized for 20 Q&A
 def create_prompt(text_chunk):
     """Return the instruction prompt for one chunk of tender text."""
     # The prompt embeds `text_chunk` under CONTENT:, lists 20 numbered
     # questions, and asks the model to answer in "Q1:/A1:" format.
     return f"""
 You are an expert in analyzing U.S. government tender documents. Based only on the content provided below, answer the following 20 standard questions in Q&A format. If something is not mentioned, write "Not mentioned in the provided document."
 CONTENT:
 {text_chunk}
 Now provide answers for:
+1. What is the general scope of the tender?
+2. Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?
+3. Is there a Set-aside status (e.g., 8a, SDVOSB)?
+4. Are U.S. citizens or security-cleared staff required?
+5. What is the expected team size or key qualifications?
+6. Are offshore resources allowed?
+7. What is the mode of working (On-site/Remote/Hybrid)?
+8. Is presence in specific regions/states required?
+9. Is the delivery location defined?
+10. Is remote or offshore delivery allowed?
+11. Is a U.S. office presence required?
+12. Are travel/lodging expenses reimbursable?
+13. Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?
+14. Are background checks or security clearance needed?
+15. Is past experience required?
+16. How many references are required?
+17. Are only U.S. references accepted?
+18. Is private sector experience allowed?
+19. Do references need to be identified?
+20. Is subcontracting permitted?
+Answer in this format:
+Q1: ...
+A1: ...
+Q2: ...
+A2: ...
 ...
 """
+# ✅ Clean model output to remove repeated prompt content
 def clean_output(raw_output):
     """Extract the Q1..A20 answer block from raw model output.

     The text-generation output may contain the echoed prompt before the
     answers and a repeated question block after them. This keeps the lines
     from the first real ``Q1:`` line through the first ``A20:`` line
     (inclusive) and drops everything else.

     Args:
         raw_output: Full generated text, possibly including the prompt.

     Returns:
         The stripped Q&A block; everything from ``Q1:`` onward when no
         ``A20:`` line exists; ``""`` when no real ``Q1:`` line is found.
     """
     lines = raw_output.splitlines()
     # The prompt's format example contains the literal placeholder line
     # "Q1: ..."; skip it so the echoed template is not mistaken for the
     # start of the answers.
     start = next(
         (
             idx
             for idx, line in enumerate(lines)
             if line.strip().startswith("Q1:") and line.strip() != "Q1: ..."
         ),
         None,
     )
     if start is None:
         return ""
     kept = []
     for line in lines[start:]:
         kept.append(line)
         # Stop right after the A20 line. The previous implementation cut the
         # text at "Q20:" and thereby discarded the A20 answer itself.
         if line.strip().startswith("A20:"):
             break
     return "\n".join(kept).strip()
+# 🔍 GPU-enabled analyzer
 @spaces.GPU(duration=150)
+def analyze_document(file, status_text, cancel_flag):
+    filename = file.name
+    ext = os.path.splitext(filename)[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
+        return "❌ Unsupported file format. Please upload a PDF or DOCX."
     if len(raw_text.strip()) == 0:
+        return "❌ No text found in the document."
     chunks = chunk_text(raw_text)
     full_summary = ""
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     for i, chunk in enumerate(chunks):
+        if cancel_flag.value:
+            return "⛔ Analysis cancelled by user."
+        status_text.value = f"🔄 Processing chunk {i+1} of {len(chunks)}..."
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
+        answer = clean_output(result)
+        full_summary += answer + "\n\n---\n\n"
+    status_text.value = "✅ Completed"
+    return full_summary.strip()
 # 🌐 Gradio Interface
 with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
             with gr.Row():
                 analyze_button = gr.Button("🔍 Analyze", variant="primary")
                 terminate_button = gr.Button("❌ Terminate", variant="stop")
+            status_text = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)
         with gr.Column(scale=2):
             output_box = gr.Textbox(label="🧠 Extracted Tender Intelligence", lines=30, interactive=False)
+    cancel_flag = gr.State(value=False)
     analyze_button.click(
         fn=analyze_document,
+        inputs=[file_input, status_text, cancel_flag],
+        outputs=output_box
     )
     terminate_button.click(