sejalkishan committed on
Commit d8c1543 · verified · 1 Parent(s): 87c367f

Update app.py

Files changed (1)
  1. app.py +41 -31
app.py CHANGED
@@ -1,25 +1,43 @@
 import gradio as gr
 import pdfplumber
 import docx
+import easyocr
+import numpy as np
+from PIL import Image
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login
 import torch
 import os
-import spaces  # Needed for @spaces.GPU decorator
 
-# 🔐 Authenticate using Hugging Face token stored as secret in the Space
-login(token=os.environ.get("token"))
+# 🔐 Authenticate using Hugging Face token (if needed for gated repos)
+# If you're using public models, this can be commented out.
+if os.environ.get("HF_TOKEN"):
+    login(token=os.environ["HF_TOKEN"])
 
-# ✅ Optional GPU logging
-if torch.cuda.is_available():
-    print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
-else:
-    print("⚠️ Running on CPU (not recommended).")
+# 🚀 Check if GPU is available
+if not torch.cuda.is_available():
+    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
 
-# 🔧 Model details
+print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
+
+# 🔧 Load model
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    use_auth_token=True,
+    trust_remote_code=True
+)
+
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+# 🧠 Load EasyOCR
+reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
-# 📄 Extract text from PDF
+# 📄 Extract text from PDF with OCR fallback
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
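Note on this hunk: model loading moves from the request handler to import time, so the 7B model loads once when the Space boots instead of on every upload, and the ZeroGPU `@spaces.GPU` path is dropped in favor of a hard GPU check. One hedged caveat: recent `transformers` releases deprecate `use_auth_token=` in favor of `token=` (the exact cutover depends on the installed version), so a variant like this may avoid deprecation warnings:

```python
# Hedged variant of the new loading block: pass the Space secret explicitly
# via `token=` instead of the deprecated `use_auth_token=True`. HF_TOKEN is
# the same secret the login() call reads; None falls back to any cached login.
hf_token = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
)
```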
@@ -27,6 +45,13 @@ def extract_text_from_pdf(file):
             page_text = page.extract_text()
             if page_text:
                 text += page_text + "\n"
+            else:
+                image = page.to_image(resolution=300).original.convert("RGB")
+                image_np = np.array(image)
+                ocr_result = reader.readtext(image_np, detail=0)
+                ocr_text = "\n".join(ocr_result)
+                if ocr_text.strip():
+                    text += ocr_text + "\n"
     return text
 
 # 📄 Extract text from DOCX
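The new `else:` branch rasterizes pages that have no text layer and runs EasyOCR on them; pdfplumber's `page.to_image(...)` returns a `PageImage` whose `.original` attribute is the underlying PIL image. A minimal standalone check of that path (the file name is hypothetical):

```python
# Smoke test for the OCR fallback on a scanned, image-only PDF.
import numpy as np
import easyocr
import pdfplumber

reader = easyocr.Reader(['en'], gpu=False)  # CPU is fine for a one-off check

with pdfplumber.open("scanned_sample.pdf") as pdf:  # hypothetical file
    page = pdf.pages[0]
    img = np.array(page.to_image(resolution=300).original.convert("RGB"))
    print("\n".join(reader.readtext(img, detail=0)))  # detail=0 -> plain strings
```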
@@ -34,7 +59,7 @@ def extract_text_from_docx(file):
     doc = docx.Document(file)
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
 
-# 🧩 Chunk large text into ~6000 character chunks
+# 🧩 Chunk text for LLM processing
 def chunk_text(text, max_chars=6000):
     paragraphs = text.split("\n")
     chunks, current_chunk = [], ""
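One limitation worth noting in the unchanged DOCX path: `doc.paragraphs` yields only body paragraphs, so text inside DOCX tables (where tender budgets often live) is skipped. A hypothetical helper, if table text matters:

```python
# Hypothetical extension: harvest text from DOCX tables, which doc.paragraphs
# does not include. `doc` is a docx.Document as in extract_text_from_docx.
def extract_table_text(doc):
    rows = []
    for table in doc.tables:
        for row in table.rows:
            rows.append(" | ".join(cell.text.strip() for cell in row.cells))
    return "\n".join(rows)
```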
@@ -48,7 +73,7 @@ def chunk_text(text, max_chars=6000):
         chunks.append(current_chunk)
     return chunks
 
-# 🧠 Prompt to extract key info
+# 🧠 LLM Prompt Template
 def create_prompt(text_chunk):
     return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
1. Total manpower required
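The body of `chunk_text` is elided by the diff; from the signature and tail it reads as a greedy paragraph packer with a ~6000-character budget. A quick sanity check under that assumption:

```python
# Sanity check (assumes greedy paragraph packing, as the surrounding lines
# suggest): chunk sizes should hover at or below max_chars.
sample = "\n".join(f"Paragraph {i}: " + "x" * 500 for i in range(40))
chunks = chunk_text(sample, max_chars=6000)
print(len(chunks), [len(c) for c in chunks])
```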
@@ -61,8 +86,7 @@ CONTENT:
 {text_chunk}
 """
 
-# 🧠 GPU-decorated main function — forces GPU allocation during processing
-@spaces.GPU(duration=300)  # up to 10 minutes GPU time
+# 🔍 Main handler
 def analyze_document(file):
     filename = file.name
     ext = os.path.splitext(filename)[-1].lower()
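A hedged refinement for this prompt: Mistral-7B-Instruct-v0.2 is trained on `[INST] ... [/INST]` chat formatting, so routing the extraction prompt through the tokenizer's chat template (available in transformers >= 4.34) may produce cleaner answers than raw text:

```python
# Hypothetical tweak: wrap each chunk's prompt in Mistral's chat format via
# the tokenizer's built-in template before calling the pipeline.
messages = [{"role": "user", "content": create_prompt(chunk)}]
chat_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
```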
@@ -77,35 +101,21 @@ def analyze_document(file):
     if len(raw_text.strip()) == 0:
         return "❌ No text found in the document."
 
-    # Load model and tokenizer INSIDE this GPU-decorated function
-    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",  # Auto GPU assignment
-        torch_dtype=torch.float16,  # Optimized for GPU
-        use_auth_token=True,
-        trust_remote_code=True
-    )
-    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
     chunks = chunk_text(raw_text)
-    full_summary = ""
 
+    full_summary = ""
     for chunk in chunks:
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=512, do_sample=False, temperature=0.5)[0]["generated_text"]
         answer = result.split("CONTENT:")[-1].strip()
         full_summary += answer + "\n\n---\n\n"
 
-    # Optional: Clear GPU memory after use
-    torch.cuda.empty_cache()
-
     return full_summary
 
 # 🎨 Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Documents (GPU-Enhanced)")
-    gr.Markdown("Upload a PDF or DOCX file. The app extracts manpower, timeline, technical needs, and budget details using a large LLM.")
+    gr.Markdown("## 📄 Smart Document Analyzer – Tender & Technical Documents (GPU-Powered, OCR-Ready)")
+    gr.Markdown("Upload a PDF or DOCX file. This tool extracts manpower, timeline, technical needs, and budget using a powerful LLM with OCR support for scanned PDFs.")
 
     with gr.Row():
         file_input = gr.File(label="📎 Upload PDF or Word Document")
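Two details a reviewer might flag in the generation loop: `temperature=0.5` has no effect when `do_sample=False` (greedy decoding ignores it, and transformers warns about it), and `result.split("CONTENT:")[-1]` returns the echoed chunk text along with the model's answer, because the pipeline includes the prompt in `generated_text` by default. A hedged alternative for the loop body:

```python
# return_full_text=False makes the pipeline return only newly generated
# tokens, so no prompt-echo splitting is needed; temperature is dropped
# because greedy decoding (do_sample=False) ignores it.
result = generator(
    prompt, max_new_tokens=512, do_sample=False, return_full_text=False
)[0]["generated_text"]
full_summary += result.strip() + "\n\n---\n\n"
```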
 
1
  import gradio as gr
2
  import pdfplumber
3
  import docx
4
+ import easyocr
5
+ import numpy as np
6
+ from PIL import Image
7
  from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
8
  from huggingface_hub import login
9
  import torch
10
  import os
 
11
 
12
+ # πŸ” Authenticate using Hugging Face token (if needed for gated repos)
13
+ # If you're using public models, this can be commented out.
14
+ if os.environ.get("HF_TOKEN"):
15
+ login(token=os.environ["HF_TOKEN"])
16
 
17
+ # πŸš€ Check if GPU is available
18
+ if not torch.cuda.is_available():
19
+ raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
 
 
20
 
21
+ print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")
22
+
23
+ # πŸ”§ Load model
24
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
25
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
26
+
27
+ model = AutoModelForCausalLM.from_pretrained(
28
+ model_id,
29
+ device_map="auto",
30
+ torch_dtype=torch.float16,
31
+ use_auth_token=True,
32
+ trust_remote_code=True
33
+ )
34
+
35
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
36
+
37
+ # 🧠 Load EasyOCR
38
+ reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
39
 
40
+ # πŸ“„ Extract text from PDF with OCR fallback
41
  def extract_text_from_pdf(file):
42
  text = ""
43
  with pdfplumber.open(file) as pdf:
 
45
  page_text = page.extract_text()
46
  if page_text:
47
  text += page_text + "\n"
48
+ else:
49
+ image = page.to_image(resolution=300).original.convert("RGB")
50
+ image_np = np.array(image)
51
+ ocr_result = reader.readtext(image_np, detail=0)
52
+ ocr_text = "\n".join(ocr_result)
53
+ if ocr_text.strip():
54
+ text += ocr_text + "\n"
55
  return text
56
 
57
  # πŸ“„ Extract text from DOCX
 
59
  doc = docx.Document(file)
60
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
61
 
62
+ # 🧩 Chunk text for LLM processing
63
  def chunk_text(text, max_chars=6000):
64
  paragraphs = text.split("\n")
65
  chunks, current_chunk = [], ""
 
73
  chunks.append(current_chunk)
74
  return chunks
75
 
76
+ # 🧠 LLM Prompt Template
77
  def create_prompt(text_chunk):
78
  return f"""You are an expert in analyzing tender and project documents. Read the following content and extract:
79
  1. Total manpower required
 
86
  {text_chunk}
87
  """
88
 
89
+ # πŸ” Main handler
 
90
  def analyze_document(file):
91
  filename = file.name
92
  ext = os.path.splitext(filename)[-1].lower()
 
101
  if len(raw_text.strip()) == 0:
102
  return "❌ No text found in the document."
103
 
 
 
 
 
 
 
 
 
 
 
 
104
  chunks = chunk_text(raw_text)
 
105
 
106
+ full_summary = ""
107
  for chunk in chunks:
108
  prompt = create_prompt(chunk)
109
  result = generator(prompt, max_new_tokens=512, do_sample=False, temperature=0.5)[0]["generated_text"]
110
  answer = result.split("CONTENT:")[-1].strip()
111
  full_summary += answer + "\n\n---\n\n"
112
 
 
 
 
113
  return full_summary
114
 
115
  # 🎨 Gradio Interface
116
  with gr.Blocks() as demo:
117
+ gr.Markdown("## πŸ“„ Smart Document Analyzer – Tender & Technical Documents (GPU-Powered, OCR-Ready)")
118
+ gr.Markdown("Upload a PDF or DOCX file. This tool extracts manpower, timeline, technical needs, and budget using a powerful LLM with OCR support for scanned PDFs.")
119
 
120
  with gr.Row():
121
  file_input = gr.File(label="πŸ“Ž Upload PDF or Word Document")