sejalkishan committed on
Commit
da8f4ee
·
verified ·
1 Parent(s): 61b6f1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -60
app.py CHANGED
@@ -8,19 +8,20 @@ import pytesseract
8
  import torch
9
  import os
10
  import spaces
 
11
 
12
- # 🔐 Authenticate Hugging Face token
13
  login(token=os.environ.get("token"))
14
 
15
- # ✅ Ensure GPU is available
16
  if not torch.cuda.is_available():
17
  raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
18
  print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
19
 
20
- # 🧠 Model
21
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
22
 
23
- # 📄 Document extractors
24
  def extract_text_from_pdf(file):
25
  text = ""
26
  with pdfplumber.open(file) as pdf:
@@ -51,76 +52,82 @@ def chunk_text(text, max_chars=6000):
51
  chunks.append(current_chunk)
52
  return chunks
53
 
54
- # 🧾 Q&A Prompt Template
55
  def create_prompt(text_chunk):
56
  return f"""
57
  You are an expert in analyzing U.S. government tender documents. Based only on the content provided below, answer the following 20 standard questions in Q&A format. If something is not mentioned, write "Not mentioned in the provided document."
58
-
59
  CONTENT:
60
  {text_chunk}
61
-
62
  Now provide answers for:
63
-
64
- Q1: What is the general scope of the tender?
65
- Q2: Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?
66
- Q3: Is there a Set-aside status (e.g., 8a, SDVOSB)?
67
- Q4: Are U.S. citizens or security-cleared staff required?
68
- Q5: What is the expected team size or key qualifications?
69
- Q6: Are offshore resources allowed?
70
- Q7: What is the mode of working (On-site/Remote/Hybrid)?
71
- Q8: Is presence in specific regions/states required?
72
- Q9: Is the delivery location defined?
73
- Q10: Is remote or offshore delivery allowed?
74
- Q11: Is a U.S. office presence required?
75
- Q12: Are travel/lodging expenses reimbursable?
76
- Q13: Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?
77
- Q14: Are background checks or security clearance needed?
78
- Q15: Is past experience required?
79
- Q16: How many references are required?
80
- Q17: Are only U.S. references accepted?
81
- Q18: Is private sector experience allowed?
82
- Q19: Do references need to be identified?
83
- Q20: Is subcontracting permitted?
84
-
 
 
 
85
  ...
86
  """
87
 
88
- # 🧼 Cleaner
89
  def clean_output(raw_output):
90
- # Find first valid Q1
91
- start_idx = raw_output.find("Q1:")
92
- if start_idx == -1:
93
- return raw_output.strip()
 
 
 
 
 
94
 
95
- cleaned = raw_output[start_idx:]
96
 
97
- # Remove everything after second instance of Q1 (to drop repeated prompts)
98
- second_q1 = cleaned.find("Q1:", 3) # skip first one
99
- if second_q1 != -1:
100
- cleaned = cleaned[:second_q1]
101
 
102
- # Drop leftover instructions if they show up later
103
- cut_phrases = ["You are an expert", "Now provide answers", "CONTENT:", "Answer clearly and in the same format:"]
104
- for phrase in cut_phrases:
105
- if phrase in cleaned:
106
- cleaned = cleaned.split(phrase)[0]
107
 
108
- return cleaned.strip()
109
 
110
- # 🚀 Main analysis function
111
  @spaces.GPU(duration=150)
112
- def analyze_document(file, cancel_flag):
113
- ext = os.path.splitext(file.name)[-1].lower()
 
114
 
115
  if ext == ".pdf":
116
  raw_text = extract_text_from_pdf(file)
117
  elif ext == ".docx":
118
  raw_text = extract_text_from_docx(file)
119
  else:
120
- return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format"
121
 
122
  if len(raw_text.strip()) == 0:
123
- return "❌ No text found in the document.", "❌ Empty document"
124
 
125
  chunks = chunk_text(raw_text)
126
  full_summary = ""
@@ -136,16 +143,17 @@ def analyze_document(file, cancel_flag):
136
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
137
 
138
  for i, chunk in enumerate(chunks):
139
- if cancel_flag:
140
- return "⛔ Analysis cancelled by user.", "⛔ Terminated by user"
141
 
142
- status_msg = f"🔄 Processing chunk {i+1} of {len(chunks)}..."
143
  prompt = create_prompt(chunk)
144
  result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
145
- cleaned = clean_output(result)
146
- full_summary += cleaned + "\n\n---\n\n"
147
 
148
- return full_summary.strip(), "✅ Completed"
 
149
 
150
  # 🌐 Gradio Interface
151
  with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
@@ -157,17 +165,17 @@ with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
157
  with gr.Row():
158
  analyze_button = gr.Button("🔍 Analyze", variant="primary")
159
  terminate_button = gr.Button("❌ Terminate", variant="stop")
160
- status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)
161
 
162
  with gr.Column(scale=2):
163
  output_box = gr.Textbox(label="🧠 Extracted Tender Intelligence", lines=30, interactive=False)
164
 
165
- cancel_flag = gr.State(False)
166
 
167
  analyze_button.click(
168
  fn=analyze_document,
169
- inputs=[file_input, cancel_flag],
170
- outputs=[output_box, status_box]
171
  )
172
 
173
  terminate_button.click(
 
8
  import torch
9
  import os
10
  import spaces
11
+ import re
12
 
13
+ # 🔐 Hugging Face authentication
14
  login(token=os.environ.get("token"))
15
 
16
+ # ✅ Check GPU availability
17
  if not torch.cuda.is_available():
18
  raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
19
  print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
20
 
21
+ # 🧠 Model ID
22
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
23
 
24
+ # 📄 Extractor for PDF (with OCR) and DOCX
25
  def extract_text_from_pdf(file):
26
  text = ""
27
  with pdfplumber.open(file) as pdf:
 
52
  chunks.append(current_chunk)
53
  return chunks
54
 
55
+ # 🧾 Prompt optimized for 20 Q&A
56
  def create_prompt(text_chunk):
57
  return f"""
58
  You are an expert in analyzing U.S. government tender documents. Based only on the content provided below, answer the following 20 standard questions in Q&A format. If something is not mentioned, write "Not mentioned in the provided document."
 
59
  CONTENT:
60
  {text_chunk}
 
61
  Now provide answers for:
62
+ 1. What is the general scope of the tender?
63
+ 2. Are certifications like SAM, CMMI, ISO, SBA, 8(a), or GSA required?
64
+ 3. Is there a Set-aside status (e.g., 8a, SDVOSB)?
65
+ 4. Are U.S. citizens or security-cleared staff required?
66
+ 5. What is the expected team size or key qualifications?
67
+ 6. Are offshore resources allowed?
68
+ 7. What is the mode of working (On-site/Remote/Hybrid)?
69
+ 8. Is presence in specific regions/states required?
70
+ 9. Is the delivery location defined?
71
+ 10. Is remote or offshore delivery allowed?
72
+ 11. Is a U.S. office presence required?
73
+ 12. Are travel/lodging expenses reimbursable?
74
+ 13. Are cybersecurity frameworks (FedRAMP, NIST, HIPAA) required?
75
+ 14. Are background checks or security clearance needed?
76
+ 15. Is past experience required?
77
+ 16. How many references are required?
78
+ 17. Are only U.S. references accepted?
79
+ 18. Is private sector experience allowed?
80
+ 19. Do references need to be identified?
81
+ 20. Is subcontracting permitted?
82
+ Answer in this format:
83
+ Q1: ...
84
+ A1: ...
85
+ Q2: ...
86
+ A2: ...
87
  ...
88
  """
89
 
90
# Clean model output: keep only the model's own Q1..A20 answer block.
def clean_output(raw_output):
    """Extract the Q&A answer block from raw text-generation output.

    The returned text starts at the first line beginning with "Q1:" that is
    not the prompt's "Q1: ..." format placeholder, and ends at whichever comes
    first: the line beginning with "A20:" (kept), a second line beginning with
    "Q1:" (dropped, treated as a repeated question block), or the end of the
    input.

    Args:
        raw_output: Text produced by the generator; it may also contain an
            echo of the prompt before and/or after the answers.

    Returns:
        The trimmed answer block, or an empty string when no usable "Q1:"
        line is present.
    """
    lines = raw_output.splitlines()

    # Locate the first real "Q1:" line. The prompt's format example contains
    # the literal line "Q1: ...", which must not be mistaken for an answer
    # when the prompt is echoed back in the generated text.
    start = None
    for idx, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith("Q1:") and stripped[3:].strip() != "...":
            start = idx
            break
    if start is None:
        return ""

    kept = []
    for line in lines[start:]:
        stripped = line.strip()
        # A second "Q1:" means the question block is being repeated.
        if kept and stripped.startswith("Q1:"):
            break
        kept.append(line)
        # Keep the A20 line itself (the previous implementation discarded it
        # by splitting on "Q20:"), then drop everything that follows.
        if stripped.startswith("A20:"):
            break

    return "\n".join(kept).strip()
115
 
116
+ # πŸ” GPU-enabled analyzer
117
  @spaces.GPU(duration=150)
118
+ def analyze_document(file, status_text, cancel_flag):
119
+ filename = file.name
120
+ ext = os.path.splitext(filename)[-1].lower()
121
 
122
  if ext == ".pdf":
123
  raw_text = extract_text_from_pdf(file)
124
  elif ext == ".docx":
125
  raw_text = extract_text_from_docx(file)
126
  else:
127
+ return "❌ Unsupported file format. Please upload a PDF or DOCX."
128
 
129
  if len(raw_text.strip()) == 0:
130
+ return "❌ No text found in the document."
131
 
132
  chunks = chunk_text(raw_text)
133
  full_summary = ""
 
143
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
144
 
145
  for i, chunk in enumerate(chunks):
146
+ if cancel_flag.value:
147
+ return "⛔ Analysis cancelled by user."
148
 
149
+ status_text.value = f"🔄 Processing chunk {i+1} of {len(chunks)}..."
150
  prompt = create_prompt(chunk)
151
  result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
152
+ answer = clean_output(result)
153
+ full_summary += answer + "\n\n---\n\n"
154
 
155
+ status_text.value = "✅ Completed"
156
+ return full_summary.strip()
157
 
158
  # 🌐 Gradio Interface
159
  with gr.Blocks(title="Smart Tender Analyzer - US Edition") as demo:
 
165
  with gr.Row():
166
  analyze_button = gr.Button("🔍 Analyze", variant="primary")
167
  terminate_button = gr.Button("❌ Terminate", variant="stop")
168
+ status_text = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)
169
 
170
  with gr.Column(scale=2):
171
  output_box = gr.Textbox(label="🧠 Extracted Tender Intelligence", lines=30, interactive=False)
172
 
173
+ cancel_flag = gr.State(value=False)
174
 
175
  analyze_button.click(
176
  fn=analyze_document,
177
+ inputs=[file_input, status_text, cancel_flag],
178
+ outputs=output_box
179
  )
180
 
181
  terminate_button.click(