Spaces:

sejalkishan
/

doc-sum

Build error

App Files Files Community

sejalkishan committed on Jul 12, 2025

Commit

d8c1543

verified ·

1 Parent(s): 87c367f

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -31

app.py CHANGED Viewed

@@ -1,25 +1,43 @@
 import gradio as gr
 import pdfplumber
 import docx
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
 import torch
 import os
-import spaces  # Needed for @spaces.GPU decorator
-# 🔐 Authenticate using Hugging Face token stored as secret in the Space
-login(token=os.environ.get("token"))
-# ✅ Optional GPU logging
-if torch.cuda.is_available():
-    print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
-else:
-    print("⚠️ Running on CPU (not recommended).")
-# 🔧 Model details
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
-# 📄 Extract text from PDF
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
@@ -27,6 +45,13 @@ def extract_text_from_pdf(file):
             page_text = page.extract_text()
             if page_text:
                 text += page_text + "\n"
     return text
 # 📄 Extract text from DOCX
@@ -34,7 +59,7 @@ def extract_text_from_docx(file):
     doc = docx.Document(file)
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
-# 🧩 Chunk large text into ~6000 character chunks
 def chunk_text(text, max_chars=6000):
     paragraphs = text.split("\n")
     chunks, current_chunk = [], ""
@@ -48,7 +73,7 @@ def chunk_text(text, max_chars=6000):
         chunks.append(current_chunk)
     return chunks
-# 🧠 Prompt to extract key info
 def create_prompt(text_chunk):
     return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
 1. Total manpower required
@@ -61,8 +86,7 @@ CONTENT:
 {text_chunk}
 """
-# 🧠 GPU-decorated main function — forces GPU allocation during processing
-@spaces.GPU(duration=300)  # up to 5 minutes GPU time (duration is in seconds)
 def analyze_document(file):
     filename = file.name
     ext = os.path.splitext(filename)[-1].lower()
@@ -77,35 +101,21 @@ def analyze_document(file):
     if len(raw_text.strip()) == 0:
         return "❌ No text found in the document."
-    # Load model and tokenizer INSIDE this GPU-decorated function
-    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",                  # Auto GPU assignment
-        torch_dtype=torch.float16,          # Optimized for GPU
-        use_auth_token=True,
-        trust_remote_code=True
-    )
-    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     chunks = chunk_text(raw_text)
-    full_summary = ""
     for chunk in chunks:
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=512, do_sample=False, temperature=0.5)[0]["generated_text"]
         answer = result.split("CONTENT:")[-1].strip()
         full_summary += answer + "\n\n---\n\n"
-    # Optional: Clear GPU memory after use
-    torch.cuda.empty_cache()
     return full_summary
 # 🎨 Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Documents (GPU-Enhanced)")
-    gr.Markdown("Upload a PDF or DOCX file. The app extracts manpower, timeline, technical needs, and budget details using a large LLM.")
     with gr.Row():
         file_input = gr.File(label="📎 Upload PDF or Word Document")

 import gradio as gr
 import pdfplumber
 import docx
+import easyocr
+import numpy as np
+from PIL import Image
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
 import torch
 import os
+# 🔐 Authenticate using Hugging Face token (if needed for gated repos)
+# If you're using public models, this can be commented out.
+if os.environ.get("HF_TOKEN"):
+    login(token=os.environ["HF_TOKEN"])
+# 🚀 Check if GPU is available
+if not torch.cuda.is_available():
+    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
+print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
+# 🔧 Load model
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    use_auth_token=True,
+    trust_remote_code=True
+)
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+# 🧠 Load EasyOCR
+reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
+# 📄 Extract text from PDF with OCR fallback
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
             page_text = page.extract_text()
             if page_text:
                 text += page_text + "\n"
+            else:
+                image = page.to_image(resolution=300).original.convert("RGB")
+                image_np = np.array(image)
+                ocr_result = reader.readtext(image_np, detail=0)
+                ocr_text = "\n".join(ocr_result)
+                if ocr_text.strip():
+                    text += ocr_text + "\n"
     return text
 # 📄 Extract text from DOCX
     doc = docx.Document(file)
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
+# 🧩 Chunk text for LLM processing
 def chunk_text(text, max_chars=6000):
     paragraphs = text.split("\n")
     chunks, current_chunk = [], ""
         chunks.append(current_chunk)
     return chunks
+# 🧠 LLM Prompt Template
 def create_prompt(text_chunk):
     return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
 1. Total manpower required
 {text_chunk}
 """
+# 🔍 Main handler
 def analyze_document(file):
     filename = file.name
     ext = os.path.splitext(filename)[-1].lower()
     if len(raw_text.strip()) == 0:
         return "❌ No text found in the document."
     chunks = chunk_text(raw_text)
+    full_summary = ""
     for chunk in chunks:
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=512, do_sample=False, temperature=0.5)[0]["generated_text"]
         answer = result.split("CONTENT:")[-1].strip()
         full_summary += answer + "\n\n---\n\n"
     return full_summary
 # 🎨 Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Documents (GPU-Powered, OCR-Ready)")
+    gr.Markdown("Upload a PDF or DOCX file. This tool extracts manpower, timeline, technical needs, and budget using a powerful LLM with OCR support for scanned PDFs.")
     with gr.Row():
         file_input = gr.File(label="📎 Upload PDF or Word Document")