tejovanth committed on
Commit
9b9ef33
·
verified ·
1 Parent(s): 18c46de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -12
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import fitz # PyMuPDF
3
  import torch
4
  from transformers import pipeline
5
- import time, logging, re
6
  import matplotlib
7
  matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
@@ -62,24 +62,20 @@ def summarize_file(file_bytes):
62
  if not text.strip():
63
  return "❌ No text found", None
64
 
65
- text = text[:200000] # reduced slightly for speed
66
- chunks = [text[i:i+1000] for i in range(0, len(text), 1000)][:5] # smaller + more chunks
67
  summaries = []
68
 
69
  for i, chunk in enumerate(chunks):
70
  chunk_start = time.time()
71
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
72
 
73
- if time.time() - start > 20:
74
- summaries.append("⚠️ Stopped early")
75
- break
76
-
77
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
78
  summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
79
  chunk_result['status'] = 'skipped'
80
  else:
81
  try:
82
- summary = summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
83
  summaries.append(f"### Chunk {i+1}\n{summary}")
84
  chunk_result['status'] = 'summarized'
85
  except Exception as e:
@@ -98,12 +94,16 @@ def summarize_file(file_bytes):
98
  image = visualize_chunk_status(chunk_info)
99
  return final_summary, image
100
 
101
- def find_relevant_passages(text, question, num_passages=3):
102
  passages = re.split(r'(?<=[.?!])\s+', text)
103
  scored = []
 
104
  for passage in passages:
105
- score = sum(1 for word in question.lower().split() if word in passage.lower())
106
- scored.append((score, passage))
 
 
 
107
  scored.sort(reverse=True)
108
  best_passages = " ".join([p for _, p in scored[:num_passages]])
109
  return best_passages
@@ -114,7 +114,7 @@ def answer_question(file_bytes, question):
114
  text = "".join(page.get_text("text") for page in doc)
115
  text = re.sub(r"\s+", " ", text).strip()
116
  text = "".join(c for c in text if ord(c) < 128)
117
- context = text[:200000]
118
  except Exception as e:
119
  return f"❌ Text extraction failed: {str(e)}"
120
 
@@ -161,3 +161,4 @@ if __name__ == "__main__":
161
  ).launch(server_port=7860)
162
  except Exception as e:
163
  print(f"❌ Gradio launch failed: {str(e)}")
 
 
2
  import fitz # PyMuPDF
3
  import torch
4
  from transformers import pipeline
5
+ import time, logging, re, difflib
6
  import matplotlib
7
  matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
 
62
  if not text.strip():
63
  return "❌ No text found", None
64
 
65
+ text = text[:300000] # allow full but reasonable size
66
+ chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
67
  summaries = []
68
 
69
  for i, chunk in enumerate(chunks):
70
  chunk_start = time.time()
71
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
72
 
 
 
 
 
73
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
74
  summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
75
  chunk_result['status'] = 'skipped'
76
  else:
77
  try:
78
+ summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
79
  summaries.append(f"### Chunk {i+1}\n{summary}")
80
  chunk_result['status'] = 'summarized'
81
  except Exception as e:
 
94
  image = visualize_chunk_status(chunk_info)
95
  return final_summary, image
96
 
97
def find_relevant_passages(text, question, num_passages=5):
    """Return the `num_passages` sentences of `text` most relevant to `question`.

    Relevance is scored primarily by the number of (lowercased, whitespace-split)
    question tokens that also appear in the passage. When a passage shares no
    tokens with the question, a fuzzy similarity ratio (difflib) is used as a
    fallback score; the ratio is always < 1, so any keyword match outranks it.

    Args:
        text: Source text to search; split into sentences on ., ?, ! boundaries.
        question: The user's question.
        num_passages: Maximum number of top-scoring passages to return (default 5).

    Returns:
        The selected passages joined with a single space, best-scoring first.
    """
    # Split on sentence-ending punctuation followed by whitespace.
    passages = re.split(r'(?<=[.?!])\s+', text)
    scored = []
    question_tokens = set(question.lower().split())
    question_lower = question.lower()  # hoisted so the fuzzy fallback is case-insensitive too
    for passage in passages:
        passage_tokens = set(passage.lower().split())
        match_score = len(question_tokens.intersection(passage_tokens))
        if match_score == 0:
            # Fuzzy fallback: compare lowercased strings so casing cannot hide a match.
            match_score = difflib.SequenceMatcher(None, question_lower, passage.lower()).ratio()
        scored.append((match_score, passage))
    # Highest score first; ties fall back to reverse lexicographic passage order.
    scored.sort(reverse=True)
    best_passages = " ".join([p for _, p in scored[:num_passages]])
    return best_passages
 
114
  text = "".join(page.get_text("text") for page in doc)
115
  text = re.sub(r"\s+", " ", text).strip()
116
  text = "".join(c for c in text if ord(c) < 128)
117
+ context = text[:300000]
118
  except Exception as e:
119
  return f"❌ Text extraction failed: {str(e)}"
120
 
 
161
  ).launch(server_port=7860)
162
  except Exception as e:
163
  print(f"❌ Gradio launch failed: {str(e)}")
164
+