Spaces:

sejalkishan
/

doc-sum

Build error

App Files Files Community

sejalkishan committed on Jul 12, 2025

Commit

de6872f

verified ·

1 Parent(s): d8c1543

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -31

app.py CHANGED Viewed

@@ -8,36 +8,19 @@ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
 import torch
 import os
-# 🔐 Authenticate using Hugging Face token (if needed for gated repos)
-# If you're using public models, this can be commented out.
-if os.environ.get("HF_TOKEN"):
-    login(token=os.environ["HF_TOKEN"])
-# 🚀 Check if GPU is available
-if not torch.cuda.is_available():
-    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
-print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
-# 🔧 Load model
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
-tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.float16,
-    use_auth_token=True,
-    trust_remote_code=True
-)
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-# 🧠 Load EasyOCR
-reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
-# 📄 Extract text from PDF with OCR fallback
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
@@ -59,7 +42,7 @@ def extract_text_from_docx(file):
     doc = docx.Document(file)
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
-# 🧩 Chunk text for LLM processing
 def chunk_text(text, max_chars=6000):
     paragraphs = text.split("\n")
     chunks, current_chunk = [], ""
@@ -73,7 +56,7 @@ def chunk_text(text, max_chars=6000):
         chunks.append(current_chunk)
     return chunks
-# 🧠 LLM Prompt Template
 def create_prompt(text_chunk):
     return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
 1. Total manpower required
@@ -86,7 +69,8 @@ CONTENT:
 {text_chunk}
 """
-# 🔍 Main handler
 def analyze_document(file):
     filename = file.name
     ext = os.path.splitext(filename)[-1].lower()
@@ -101,8 +85,20 @@ def analyze_document(file):
     if len(raw_text.strip()) == 0:
         return "❌ No text found in the document."
-    chunks = chunk_text(raw_text)
     full_summary = ""
     for chunk in chunks:
         prompt = create_prompt(chunk)
@@ -114,8 +110,8 @@ def analyze_document(file):
 # 🎨 Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Documents (GPU-Powered, OCR-Ready)")
-    gr.Markdown("Upload a PDF or DOCX file. This tool extracts manpower, timeline, technical needs, and budget using a powerful LLM with OCR support for scanned PDFs.")
     with gr.Row():
         file_input = gr.File(label="📎 Upload PDF or Word Document")

 from huggingface_hub import login
 import torch
 import os
+import spaces
+# 🔐 Authenticate if token is provided (for gated models)
+if os.environ.get("token"):
+    login(token=os.environ["token"])
+# 🧠 Load EasyOCR Reader once (outside GPU scope)
+reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
+# 🔧 Static model ID
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+# 📄 Extract text from PDF (supports OCR fallback)
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
     doc = docx.Document(file)
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
+# 🧩 Chunk text into 6000-character parts
 def chunk_text(text, max_chars=6000):
     paragraphs = text.split("\n")
     chunks, current_chunk = [], ""
         chunks.append(current_chunk)
     return chunks
+# 🧠 Prompt template
 def create_prompt(text_chunk):
     return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
 1. Total manpower required
 {text_chunk}
 """
+# 🧠 GPU-decorated main function — forces GPU allocation during processing
+@spaces.GPU(duration=300)  # up to 5 minutes (300 seconds) of GPU time
 def analyze_document(file):
     filename = file.name
     ext = os.path.splitext(filename)[-1].lower()
     if len(raw_text.strip()) == 0:
         return "❌ No text found in the document."
+    # ✅ Load model and tokenizer INSIDE GPU scope
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        use_auth_token=True,
+        trust_remote_code=True
+    )
+    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+    # 🔍 Chunked generation
+    chunks = chunk_text(raw_text)
     full_summary = ""
     for chunk in chunks:
         prompt = create_prompt(chunk)
 # 🎨 Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Docs (GPU + OCR Ready)")
+    gr.Markdown("Upload a PDF (scanned or normal) or Word file. Extract manpower, deadlines, tech needs, and budgets using LLM + OCR.")
     with gr.Row():
         file_input = gr.File(label="📎 Upload PDF or Word Document")