admin08077 committed on
Commit
0ca4cd3
·
verified ·
1 Parent(s): 42a9179

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -93
app.py CHANGED
@@ -63,19 +63,13 @@ encoding = tiktoken.get_encoding("cl100k_base")
63
  ###############################################################################
64
 
65
  def approximate_tokens(text: str) -> int:
66
- # Return an approximate token count using the chosen tokenizer
67
  return len(encoding.encode(text))
68
 
69
  def chunk_text(text, max_chunk_size=1500):
70
- """
71
- Splits text into chunks of <= max_chunk_size tokens (approx).
72
- We'll do a naive approach: break on sentence boundaries from nltk.
73
- """
74
  sentences = nltk.sent_tokenize(text)
75
  chunks = []
76
  current_chunk = ""
77
  current_tokens = 0
78
-
79
  for sent in sentences:
80
  sent_tokens = approximate_tokens(sent)
81
  if current_tokens + sent_tokens <= max_chunk_size:
@@ -91,80 +85,53 @@ def chunk_text(text, max_chunk_size=1500):
91
  return chunks
92
 
93
  def chunk_summarize(text):
94
- """
95
- Summarize large text by chunking and then joining partial summaries.
96
- """
97
- chunks = chunk_text(text, max_chunk_size=600) # 600 tokens ~ smaller chunk for BART
98
  summaries = []
99
  for ch in chunks:
100
- # Summarize each chunk
101
  out = summarizer(ch, max_length=150, min_length=40, do_sample=False)
102
  summaries.append(out[0]["summary_text"])
103
- # Optionally summarize the summaries again if needed
104
  combined = " ".join(summaries)
105
  if len(chunks) > 1:
106
- # Summarize the combined result to get a final summary
107
  final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
108
  return final[0]["summary_text"]
109
  else:
110
  return combined
111
 
112
  def do_topic_detection(text, candidate_labels=None):
113
- """
114
- Zero-shot classify the text. If no candidate_labels given, use a default.
115
- """
116
  if candidate_labels is None:
117
  candidate_labels = [
118
- "legal", "technical", "creative", "finance", "sports", "health", "politics",
119
- "education", "entertainment", "business"
120
  ]
121
- # We'll chunk the text to keep it from being too large
122
  chunks = chunk_text(text, max_chunk_size=512)
123
- # We'll do a naive approach: classify each chunk, take the top label
124
  label_counts = {}
125
  for ch in chunks:
126
  result = zero_shot_classifier(ch, candidate_labels)
127
  top_label = result["labels"][0]
128
  label_counts[top_label] = label_counts.get(top_label, 0) + 1
129
- # Return the top 3
130
  sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
131
  top_labels = [lbl for (lbl, _) in sorted_labels[:3]]
132
  return top_labels
133
 
134
  def do_ocr_on_image(image_bytes):
135
- """
136
- OCR an image (bytes) using Tesseract.
137
- """
138
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
139
  return pytesseract.image_to_string(image)
140
 
141
  def is_page_scanned(page_text):
142
- """If PyPDF2 doesn't return text or it's extremely short, we assume scanned."""
143
- if not page_text or len(page_text.strip()) < 20:
144
- return True
145
- return False
146
 
147
  def extract_text_from_pdf(pdf_file) -> str:
148
- """
149
- Attempt to extract text from each page using PyPDF2.
150
- If a page appears scanned, fallback to OCR using Tesseract.
151
- """
152
  reader = PyPDF2.PdfReader(pdf_file)
153
  all_text = []
154
  for page_index, page in enumerate(reader.pages):
155
- # Try native extraction
156
  extracted = page.extract_text()
157
  if not extracted or is_page_scanned(extracted):
158
- # Convert page to image and apply OCR
159
  try:
160
- # Extract single page as a separate PDF
161
  writer = PyPDF2.PdfWriter()
162
  writer.add_page(page)
163
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
164
  writer.write(temp_pdf)
165
  temp_pdf_path = temp_pdf.name
166
-
167
- # Convert PDF to image using pdf2image
168
  from pdf2image import convert_from_path
169
  images = convert_from_path(temp_pdf_path)
170
  if images:
@@ -196,7 +163,6 @@ def parse_xml(file_obj):
196
  return raw.decode("utf-8", errors="ignore")
197
 
198
  def parse_image(file_obj):
199
- # OCR the image
200
  image_bytes = file_obj.read()
201
  return do_ocr_on_image(image_bytes)
202
 
@@ -206,26 +172,18 @@ def get_file_extension(filename):
206
  ###############################################################################
207
  # 3. Data Structures & In-Memory Store #
208
  ###############################################################################
209
- # We'll store user sessions in a dict: session_id -> {files: {...}, chat_history: [...]}
210
 
211
  SESSIONS = {}
212
-
213
  def create_session():
214
  return str(uuid.uuid4())
215
 
216
  ###############################################################################
217
  # 4. Multi-File Upload and Analysis #
218
  ###############################################################################
 
219
  def load_files(files, session_id):
220
- """
221
- 1. For each file, parse the text,
222
- 2. Summarize it (chunk-based),
223
- 3. Detect topics,
224
- 4. Store page-by-page for reference search.
225
- """
226
  if session_id not in SESSIONS:
227
  SESSIONS[session_id] = {"files": {}, "chat_history": []}
228
-
229
  results = []
230
  for f in files:
231
  ext = get_file_extension(f.name)
@@ -242,12 +200,8 @@ def load_files(files, session_id):
242
  content = parse_txt(f)
243
  else:
244
  content = parse_txt(f)
245
-
246
- # Summarize
247
  summary = chunk_summarize(content) if content.strip() else ""
248
- # Topics
249
  topics = do_topic_detection(content) if content.strip() else []
250
- # Page-level split (for reference search)
251
  pages_text = []
252
  if ext == "pdf":
253
  f.seek(0)
@@ -257,23 +211,16 @@ def load_files(files, session_id):
257
  pages_text.append(ptext)
258
  else:
259
  pages_text.append(content)
260
-
261
- # Stats
262
  total_words = len(content.split())
263
  total_tokens = approximate_tokens(content)
264
-
265
  SESSIONS[session_id]["files"][f.name] = {
266
  "ext": ext,
267
  "content": content,
268
  "summary": summary,
269
  "topics": topics,
270
  "pages": pages_text,
271
- "stats": {
272
- "words": total_words,
273
- "tokens": total_tokens,
274
- }
275
  }
276
-
277
  result_str = f"**File:** {f.name}\n - Words: {total_words}, Tokens: {total_tokens}\n - Topics: {topics}\n - Summary: {summary[:200]}..."
278
  results.append(result_str)
279
  except Exception as e:
@@ -300,11 +247,8 @@ def kill_session(session_id):
300
  ###############################################################################
301
  # 5. Reference Finder (Page-Based) #
302
  ###############################################################################
 
303
  def find_reference(session_id, query):
304
- """
305
- Naively search each page (or full text for non-PDFs) for the query,
306
- then return a snippet.
307
- """
308
  if session_id not in SESSIONS:
309
  return "No session."
310
  results = []
@@ -322,11 +266,8 @@ def find_reference(session_id, query):
322
  ###############################################################################
323
  # 6. Q&A with Chunk-Based Retrieval #
324
  ###############################################################################
 
325
  def retrieve_relevant_chunks(session_id, question):
326
- """
327
- Combine file contents, chunk them, and select the top chunks matching the question.
328
- (A real implementation might use embeddings; this uses a naive approach.)
329
- """
330
  if session_id not in SESSIONS:
331
  return []
332
  text_blocks = []
@@ -345,9 +286,6 @@ def retrieve_relevant_chunks(session_id, question):
345
  return top_chunks
346
 
347
  def answer_question(session_id, question):
348
- """
349
- Use chunk-based QA (with roberta-base-squad2) on the top relevant chunks and return the best answer.
350
- """
351
  top_chunks = retrieve_relevant_chunks(session_id, question)
352
  if not top_chunks:
353
  return "No relevant chunks found in the uploaded files."
@@ -362,11 +300,8 @@ def answer_question(session_id, question):
362
  ###############################################################################
363
  # 7. Chat-Like Interface #
364
  ###############################################################################
 
365
  def chat(user_input, chat_history, session_id):
366
- """
367
- Append the user query to the chat history, run QA,
368
- and append the response. Displays approximate token usage.
369
- """
370
  if session_id not in SESSIONS:
371
  SESSIONS[session_id] = {"files": {}, "chat_history": []}
372
  if user_input.lower().startswith("ref:"):
@@ -385,10 +320,8 @@ def chat(user_input, chat_history, session_id):
385
  ###############################################################################
386
  # 8. Voice Integration (STT Only) #
387
  ###############################################################################
 
388
  def transcribe_audio(audio):
389
- """
390
- Transcribe the uploaded audio using the local Whisper tiny model.
391
- """
392
  if audio is None:
393
  return ""
394
  filepath = audio
@@ -411,53 +344,53 @@ def reset_session():
411
  with gr.Blocks() as demo:
412
  gr.Markdown("# **All-in-One Local File QA + OCR + Summaries + Topics + Voice (STT Only)**")
413
  session_id = gr.State(create_session())
414
-
415
  with gr.Column():
416
  gr.Markdown("### 1. File Upload & Analysis")
417
  file_uploader = gr.File(file_count="multiple", label="Upload your files (PDF, images, TXT, JSON, XML)")
418
  upload_btn = gr.Button("Process Files")
419
  upload_output = gr.Markdown()
420
-
421
  def on_upload(files, sid):
422
  return load_files(files, sid)
423
-
424
  upload_btn.click(on_upload, inputs=[file_uploader, session_id], outputs=upload_output)
425
-
426
  insights_btn = gr.Button("Show File Insights")
427
  insights_output = gr.Markdown()
428
  insights_btn.click(fn=show_file_insights, inputs=[session_id], outputs=insights_output)
429
-
430
  kill_btn = gr.Button("Kill Session")
431
  kill_msg = gr.Markdown()
432
  kill_btn.click(fn=kill_session, inputs=[session_id], outputs=kill_msg)
433
-
434
  new_session_btn = gr.Button("Reset Session")
435
  new_session_out = gr.Markdown()
436
  new_session_btn.click(fn=reset_session, outputs=[session_id, new_session_out])
437
-
438
  gr.Markdown("### 2. Voice Input (STT Only)")
439
- # Removed the "source" argument as it is not supported in this version.
440
  audio_in = gr.Audio(type="filepath", label="Speak your question")
441
  stt_btn = gr.Button("Transcribe")
442
  stt_output = gr.Textbox(label="Transcribed Text")
443
  stt_btn.click(fn=transcribe_audio, inputs=[audio_in], outputs=[stt_output])
444
-
445
  gr.Markdown("### 3. Chat / Q&A (Enter text below)")
446
- chatbot = gr.Chatbot(label="Chat History")
447
  user_input = gr.Textbox(label="Your question (or 'ref: <term>' for reference search)", lines=2)
448
  send_btn = gr.Button("Send")
449
-
450
  def user_message(user_msg, history):
451
  history = history + [[user_msg, None]]
452
  return "", history
453
-
454
  send_btn.click(fn=user_message, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False)
455
-
456
  def bot_message(history, sid):
 
 
 
457
  user_msg = history[-1][0]
458
- _, updated_history = chat(user_msg, history[:-1], sid)
459
  return updated_history
460
-
461
  send_btn.click(fn=bot_message, inputs=[chatbot, session_id], outputs=[chatbot])
462
-
463
  demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
63
  ###############################################################################
64
 
65
  def approximate_tokens(text: str) -> int:
 
66
  return len(encoding.encode(text))
67
 
68
  def chunk_text(text, max_chunk_size=1500):
 
 
 
 
69
  sentences = nltk.sent_tokenize(text)
70
  chunks = []
71
  current_chunk = ""
72
  current_tokens = 0
 
73
  for sent in sentences:
74
  sent_tokens = approximate_tokens(sent)
75
  if current_tokens + sent_tokens <= max_chunk_size:
 
85
  return chunks
86
 
87
  def chunk_summarize(text):
88
+ chunks = chunk_text(text, max_chunk_size=600)
 
 
 
89
  summaries = []
90
  for ch in chunks:
 
91
  out = summarizer(ch, max_length=150, min_length=40, do_sample=False)
92
  summaries.append(out[0]["summary_text"])
 
93
  combined = " ".join(summaries)
94
  if len(chunks) > 1:
 
95
  final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
96
  return final[0]["summary_text"]
97
  else:
98
  return combined
99
 
100
  def do_topic_detection(text, candidate_labels=None):
 
 
 
101
  if candidate_labels is None:
102
  candidate_labels = [
103
+ "legal", "technical", "creative", "finance", "sports", "health",
104
+ "politics", "education", "entertainment", "business"
105
  ]
 
106
  chunks = chunk_text(text, max_chunk_size=512)
 
107
  label_counts = {}
108
  for ch in chunks:
109
  result = zero_shot_classifier(ch, candidate_labels)
110
  top_label = result["labels"][0]
111
  label_counts[top_label] = label_counts.get(top_label, 0) + 1
 
112
  sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
113
  top_labels = [lbl for (lbl, _) in sorted_labels[:3]]
114
  return top_labels
115
 
116
  def do_ocr_on_image(image_bytes):
 
 
 
117
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
118
  return pytesseract.image_to_string(image)
119
 
120
  def is_page_scanned(page_text):
121
+ return not page_text or len(page_text.strip()) < 20
 
 
 
122
 
123
  def extract_text_from_pdf(pdf_file) -> str:
 
 
 
 
124
  reader = PyPDF2.PdfReader(pdf_file)
125
  all_text = []
126
  for page_index, page in enumerate(reader.pages):
 
127
  extracted = page.extract_text()
128
  if not extracted or is_page_scanned(extracted):
 
129
  try:
 
130
  writer = PyPDF2.PdfWriter()
131
  writer.add_page(page)
132
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
133
  writer.write(temp_pdf)
134
  temp_pdf_path = temp_pdf.name
 
 
135
  from pdf2image import convert_from_path
136
  images = convert_from_path(temp_pdf_path)
137
  if images:
 
163
  return raw.decode("utf-8", errors="ignore")
164
 
165
  def parse_image(file_obj):
 
166
  image_bytes = file_obj.read()
167
  return do_ocr_on_image(image_bytes)
168
 
 
172
  ###############################################################################
173
  # 3. Data Structures & In-Memory Store #
174
  ###############################################################################
 
175
 
176
  SESSIONS = {}
 
177
  def create_session():
178
  return str(uuid.uuid4())
179
 
180
  ###############################################################################
181
  # 4. Multi-File Upload and Analysis #
182
  ###############################################################################
183
+
184
  def load_files(files, session_id):
 
 
 
 
 
 
185
  if session_id not in SESSIONS:
186
  SESSIONS[session_id] = {"files": {}, "chat_history": []}
 
187
  results = []
188
  for f in files:
189
  ext = get_file_extension(f.name)
 
200
  content = parse_txt(f)
201
  else:
202
  content = parse_txt(f)
 
 
203
  summary = chunk_summarize(content) if content.strip() else ""
 
204
  topics = do_topic_detection(content) if content.strip() else []
 
205
  pages_text = []
206
  if ext == "pdf":
207
  f.seek(0)
 
211
  pages_text.append(ptext)
212
  else:
213
  pages_text.append(content)
 
 
214
  total_words = len(content.split())
215
  total_tokens = approximate_tokens(content)
 
216
  SESSIONS[session_id]["files"][f.name] = {
217
  "ext": ext,
218
  "content": content,
219
  "summary": summary,
220
  "topics": topics,
221
  "pages": pages_text,
222
+ "stats": {"words": total_words, "tokens": total_tokens}
 
 
 
223
  }
 
224
  result_str = f"**File:** {f.name}\n - Words: {total_words}, Tokens: {total_tokens}\n - Topics: {topics}\n - Summary: {summary[:200]}..."
225
  results.append(result_str)
226
  except Exception as e:
 
247
  ###############################################################################
248
  # 5. Reference Finder (Page-Based) #
249
  ###############################################################################
250
+
251
  def find_reference(session_id, query):
 
 
 
 
252
  if session_id not in SESSIONS:
253
  return "No session."
254
  results = []
 
266
  ###############################################################################
267
  # 6. Q&A with Chunk-Based Retrieval #
268
  ###############################################################################
269
+
270
  def retrieve_relevant_chunks(session_id, question):
 
 
 
 
271
  if session_id not in SESSIONS:
272
  return []
273
  text_blocks = []
 
286
  return top_chunks
287
 
288
  def answer_question(session_id, question):
 
 
 
289
  top_chunks = retrieve_relevant_chunks(session_id, question)
290
  if not top_chunks:
291
  return "No relevant chunks found in the uploaded files."
 
300
  ###############################################################################
301
  # 7. Chat-Like Interface #
302
  ###############################################################################
303
+
304
  def chat(user_input, chat_history, session_id):
 
 
 
 
305
  if session_id not in SESSIONS:
306
  SESSIONS[session_id] = {"files": {}, "chat_history": []}
307
  if user_input.lower().startswith("ref:"):
 
320
  ###############################################################################
321
  # 8. Voice Integration (STT Only) #
322
  ###############################################################################
323
+
324
  def transcribe_audio(audio):
 
 
 
325
  if audio is None:
326
  return ""
327
  filepath = audio
 
344
  with gr.Blocks() as demo:
345
  gr.Markdown("# **All-in-One Local File QA + OCR + Summaries + Topics + Voice (STT Only)**")
346
  session_id = gr.State(create_session())
347
+
348
  with gr.Column():
349
  gr.Markdown("### 1. File Upload & Analysis")
350
  file_uploader = gr.File(file_count="multiple", label="Upload your files (PDF, images, TXT, JSON, XML)")
351
  upload_btn = gr.Button("Process Files")
352
  upload_output = gr.Markdown()
353
+
354
  def on_upload(files, sid):
355
  return load_files(files, sid)
 
356
  upload_btn.click(on_upload, inputs=[file_uploader, session_id], outputs=upload_output)
357
+
358
  insights_btn = gr.Button("Show File Insights")
359
  insights_output = gr.Markdown()
360
  insights_btn.click(fn=show_file_insights, inputs=[session_id], outputs=insights_output)
361
+
362
  kill_btn = gr.Button("Kill Session")
363
  kill_msg = gr.Markdown()
364
  kill_btn.click(fn=kill_session, inputs=[session_id], outputs=kill_msg)
365
+
366
  new_session_btn = gr.Button("Reset Session")
367
  new_session_out = gr.Markdown()
368
  new_session_btn.click(fn=reset_session, outputs=[session_id, new_session_out])
369
+
370
  gr.Markdown("### 2. Voice Input (STT Only)")
371
+ # Removed the 'source' parameter because it is not supported in this version.
372
  audio_in = gr.Audio(type="filepath", label="Speak your question")
373
  stt_btn = gr.Button("Transcribe")
374
  stt_output = gr.Textbox(label="Transcribed Text")
375
  stt_btn.click(fn=transcribe_audio, inputs=[audio_in], outputs=[stt_output])
376
+
377
  gr.Markdown("### 3. Chat / Q&A (Enter text below)")
378
+ chatbot = gr.Chatbot(label="Chat History", type="messages")
379
  user_input = gr.Textbox(label="Your question (or 'ref: <term>' for reference search)", lines=2)
380
  send_btn = gr.Button("Send")
381
+
382
  def user_message(user_msg, history):
383
  history = history + [[user_msg, None]]
384
  return "", history
 
385
  send_btn.click(fn=user_message, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False)
386
+
387
  def bot_message(history, sid):
388
+ # Check if history is empty
389
+ if not history:
390
+ return []
391
  user_msg = history[-1][0]
392
+ _, updated_history = chat(user_msg, history, sid)
393
  return updated_history
 
394
  send_btn.click(fn=bot_message, inputs=[chatbot, session_id], outputs=[chatbot])
395
+
396
  demo.queue().launch(server_name="0.0.0.0", server_port=7860)