tejovanth committed on
Commit
40cfd28
·
verified ·
1 Parent(s): 2e40204

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -17
app.py CHANGED
@@ -9,7 +9,6 @@ import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
 
12
- # Logging and setup
13
  logging.basicConfig(level=logging.ERROR)
14
  device = -1 # CPU-only
15
  print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")
@@ -21,14 +20,13 @@ except Exception as e:
21
  print(f"❌ Summarizer model loading failed: {str(e)}")
22
  exit(1)
23
 
24
- # Load question-answering model
25
  try:
26
- qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device=device)
27
  except Exception as e:
28
  print(f"❌ QA model loading failed: {str(e)}")
29
  exit(1)
30
 
31
- # Function: Visualize chunk processing status
32
  def visualize_chunk_status(chunk_data):
33
  status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
34
  labels = [f"C{i['chunk']}" for i in chunk_data]
@@ -47,7 +45,6 @@ def visualize_chunk_status(chunk_data):
47
  plt.close(fig)
48
  return Image.open(buf)
49
 
50
- # Function: Summarization
51
  def summarize_file(file_bytes):
52
  start = time.time()
53
  chunk_info = []
@@ -66,7 +63,7 @@ def summarize_file(file_bytes):
66
  return "❌ No text found", None
67
 
68
  text = text[:300000]
69
- chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
70
  summaries = []
71
 
72
  for i, chunk in enumerate(chunks):
@@ -78,25 +75,31 @@ def summarize_file(file_bytes):
78
  break
79
 
80
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
81
- summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
82
  chunk_result['status'] = 'skipped'
83
  else:
84
  try:
85
  summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
86
- summaries.append(f"**Chunk {i+1}**:\n{summary}")
87
  chunk_result['status'] = 'summarized'
88
  except Exception as e:
89
- summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
90
  chunk_result['status'] = 'error'
91
 
92
  chunk_result['time'] = time.time() - chunk_start
93
  chunk_info.append(chunk_result)
94
 
95
- final_summary = f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
 
 
 
 
 
 
 
96
  image = visualize_chunk_status(chunk_info)
97
  return final_summary, image
98
 
99
- # Function: QA from PDF
100
  def answer_question(file_bytes, question):
101
  try:
102
  doc = fitz.open(stream=file_bytes, filetype="pdf")
@@ -116,19 +119,19 @@ def answer_question(file_bytes, question):
116
  except Exception as e:
117
  return f"❌ QA failed: {str(e)}"
118
 
119
- # Gradio UI for Summarizer
120
  summarizer_ui = gr.Interface(
121
  fn=summarize_file,
122
  inputs=gr.File(label="📄 Upload PDF", type="binary"),
123
  outputs=[
124
- gr.Textbox(label="📝 Summarized Output"),
125
  gr.Image(label="📊 Visual Process Flow", type="pil")
126
  ],
127
  title="📝 AI-Powered PDF Summarizer",
128
- description="Summarizes long PDFs (up to 300,000 characters) and visualizes chunk-level automation status."
129
  )
130
 
131
- # Gradio UI for Q&A
132
  qa_ui = gr.Interface(
133
  fn=answer_question,
134
  inputs=[
@@ -137,10 +140,10 @@ qa_ui = gr.Interface(
137
  ],
138
  outputs=gr.Textbox(label="🔍 Answer"),
139
  title="📚 PDF Q&A Assistant",
140
- description="Ask natural language questions based on the uploaded PDF content."
141
  )
142
 
143
- # Combine both in tabs
144
  if __name__ == "__main__":
145
  try:
146
  gr.TabbedInterface(
 
9
  import io
10
  from PIL import Image
11
 
 
12
  logging.basicConfig(level=logging.ERROR)
13
  device = -1 # CPU-only
14
  print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")
 
20
  print(f"❌ Summarizer model loading failed: {str(e)}")
21
  exit(1)
22
 
23
+ # Load better QA model
24
  try:
25
+ qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=device)
26
  except Exception as e:
27
  print(f"❌ QA model loading failed: {str(e)}")
28
  exit(1)
29
 
 
30
  def visualize_chunk_status(chunk_data):
31
  status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
32
  labels = [f"C{i['chunk']}" for i in chunk_data]
 
45
  plt.close(fig)
46
  return Image.open(buf)
47
 
 
48
  def summarize_file(file_bytes):
49
  start = time.time()
50
  chunk_info = []
 
63
  return "❌ No text found", None
64
 
65
  text = text[:300000]
66
+ chunks = [text[i:i+2000] for i in range(0, len(text), 2000)][:3] # Limit to 3 chunks for testing
67
  summaries = []
68
 
69
  for i, chunk in enumerate(chunks):
 
75
  break
76
 
77
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
78
+ summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
79
  chunk_result['status'] = 'skipped'
80
  else:
81
  try:
82
  summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
83
+ summaries.append(f"### Chunk {i+1}\n{summary}")
84
  chunk_result['status'] = 'summarized'
85
  except Exception as e:
86
+ summaries.append(f"### Chunk {i+1}: ❌ Error: {str(e)}")
87
  chunk_result['status'] = 'error'
88
 
89
  chunk_result['time'] = time.time() - chunk_start
90
  chunk_info.append(chunk_result)
91
 
92
+ formatted_chunks = "\n\n---\n\n".join(summaries)
93
+ final_summary = f"""**Characters Processed**: {len(text)}
94
+ **Total Time**: {time.time()-start:.2f} seconds
95
+
96
+ ## 🔹 Summary by Chunks
97
+
98
+ {formatted_chunks}
99
+ """
100
  image = visualize_chunk_status(chunk_info)
101
  return final_summary, image
102
 
 
103
  def answer_question(file_bytes, question):
104
  try:
105
  doc = fitz.open(stream=file_bytes, filetype="pdf")
 
119
  except Exception as e:
120
  return f"❌ QA failed: {str(e)}"
121
 
122
+ # Summarizer UI
123
  summarizer_ui = gr.Interface(
124
  fn=summarize_file,
125
  inputs=gr.File(label="📄 Upload PDF", type="binary"),
126
  outputs=[
127
+ gr.Textbox(label="📝 Summarized Output", lines=30, show_copy_button=True),
128
  gr.Image(label="📊 Visual Process Flow", type="pil")
129
  ],
130
  title="📝 AI-Powered PDF Summarizer",
131
+ description="Summarizes long PDFs and visualizes chunk-level processing (limited to 3 chunks for testing)."
132
  )
133
 
134
+ # Q&A UI
135
  qa_ui = gr.Interface(
136
  fn=answer_question,
137
  inputs=[
 
140
  ],
141
  outputs=gr.Textbox(label="🔍 Answer"),
142
  title="📚 PDF Q&A Assistant",
143
+ description="Ask natural language questions from the uploaded PDF."
144
  )
145
 
146
+ # Tabs
147
  if __name__ == "__main__":
148
  try:
149
  gr.TabbedInterface(