tejovanth committed on
Commit
2e40204
·
verified ·
1 Parent(s): ef48701

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -8
app.py CHANGED
@@ -1,24 +1,34 @@
1
  import gradio as gr
2
- import fitz
3
  import torch
4
  from transformers import pipeline
5
  import time, logging, re
6
  import matplotlib
7
- matplotlib.use('Agg') # Use non-interactive backend for headless environments
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
 
 
12
  logging.basicConfig(level=logging.ERROR)
13
  device = -1 # CPU-only
14
  print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")
15
 
 
16
  try:
17
  summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
18
  except Exception as e:
19
- print(f"❌ Model loading failed: {str(e)}")
20
  exit(1)
21
 
 
 
 
 
 
 
 
 
22
  def visualize_chunk_status(chunk_data):
23
  status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
24
  labels = [f"C{i['chunk']}" for i in chunk_data]
@@ -34,9 +44,10 @@ def visualize_chunk_status(chunk_data):
34
  buf = io.BytesIO()
35
  plt.savefig(buf, format='png')
36
  buf.seek(0)
37
- plt.close(fig) # Release memory
38
  return Image.open(buf)
39
 
 
40
  def summarize_file(file_bytes):
41
  start = time.time()
42
  chunk_info = []
@@ -85,19 +96,56 @@ def summarize_file(file_bytes):
85
  image = visualize_chunk_status(chunk_info)
86
  return final_summary, image
87
 
88
- demo = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  fn=summarize_file,
90
  inputs=gr.File(label="πŸ“„ Upload PDF", type="binary"),
91
  outputs=[
92
  gr.Textbox(label="πŸ“ Summarized Output"),
93
  gr.Image(label="πŸ“Š Visual Process Flow", type="pil")
94
  ],
95
- title="AI-Powered PDF Summarizer",
96
  description="Summarizes long PDFs (up to 300,000 characters) and visualizes chunk-level automation status."
97
  )
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  if __name__ == "__main__":
100
  try:
101
- demo.launch(share=False, server_port=7860)
 
 
 
102
  except Exception as e:
103
- print(f"❌ Gradio launch failed: {str(e)}")
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
  import torch
4
  from transformers import pipeline
5
  import time, logging, re
6
  import matplotlib
7
+ matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
 
12
# Logging and setup
logging.basicConfig(level=logging.ERROR)  # suppress library log chatter below ERROR
device = -1  # -1 tells transformers pipelines to run on CPU (no CUDA device)
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")
16
 
17
# Load summarizer
import sys  # sys.exit is reliable in non-interactive runs, unlike the site-module exit()

try:
    # t5-small on CPU, float32: small footprint suitable for the CPU-only Space.
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Summarizer model loading failed: {str(e)}")
    sys.exit(1)  # fatal: the app cannot function without the summarizer
23
 
24
# Load question-answering model
import sys  # sys.exit is reliable in non-interactive runs, unlike the site-module exit()

try:
    # Extractive QA model (SQuAD-distilled DistilBERT), run on CPU.
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device=device)
except Exception as e:
    print(f"❌ QA model loading failed: {str(e)}")
    sys.exit(1)  # fatal: the Q&A tab cannot function without this model
30
+
31
+ # Function: Visualize chunk processing status
32
  def visualize_chunk_status(chunk_data):
33
  status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
34
  labels = [f"C{i['chunk']}" for i in chunk_data]
 
44
  buf = io.BytesIO()
45
  plt.savefig(buf, format='png')
46
  buf.seek(0)
47
+ plt.close(fig)
48
  return Image.open(buf)
49
 
50
+ # Function: Summarization
51
  def summarize_file(file_bytes):
52
  start = time.time()
53
  chunk_info = []
 
96
  image = visualize_chunk_status(chunk_info)
97
  return final_summary, image
98
 
99
# Function: QA from PDF
def answer_question(file_bytes, question):
    """Answer a natural-language question from the text of an uploaded PDF.

    Args:
        file_bytes: Raw bytes of the uploaded PDF (Gradio File, type="binary").
        question: The user's question as a plain string (may be None/empty).

    Returns:
        A markdown-formatted answer with its confidence score, or an
        error/warning message string when validation, extraction, or QA fails.
    """
    # Validate the question first — cheap check before the expensive PDF
    # extraction, and None-safe (a bare .strip() would raise on None).
    if not question or not question.strip():
        return "⚠️ Please enter a valid question."

    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
        # Drop non-ASCII characters; mirrors the summarizer's preprocessing.
        text = "".join(c for c in text if ord(c) < 128)
        context = text[:300000]  # cap context size, same limit as the summarizer
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"

    # Guard against PDFs with no extractable text (e.g. scanned images).
    if not context:
        return "⚠️ No extractable text found in the PDF."

    try:
        result = qa_pipeline(question=question, context=context)
        return f"**Answer**: {result['answer']}\n\n**Score**: {result['score']:.2f}"
    except Exception as e:
        return f"❌ QA failed: {str(e)}"
118
+
119
# Gradio UI for Summarizer
# Output components named up front for readability; interface itself unchanged.
summary_box = gr.Textbox(label="📝 Summarized Output")
flow_image = gr.Image(label="📊 Visual Process Flow", type="pil")
summarizer_ui = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload PDF", type="binary"),
    outputs=[summary_box, flow_image],
    title="📝 AI-Powered PDF Summarizer",
    description="Summarizes long PDFs (up to 300,000 characters) and visualizes chunk-level automation status.",
)
130
 
131
# Gradio UI for Q&A
# Input components named up front for readability; interface itself unchanged.
pdf_upload = gr.File(label="📄 Upload PDF", type="binary")
question_box = gr.Textbox(label="❓ Ask a Question")
qa_ui = gr.Interface(
    fn=answer_question,
    inputs=[pdf_upload, question_box],
    outputs=gr.Textbox(label="🔍 Answer"),
    title="📚 PDF Q&A Assistant",
    description="Ask natural language questions based on the uploaded PDF content.",
)
142
+
143
# Combine both in tabs
if __name__ == "__main__":
    try:
        # Construction stays inside the try so build failures are also reported.
        app = gr.TabbedInterface(
            [summarizer_ui, qa_ui],
            ["📝 Summarizer", "❓ Q&A Assistant"],
        )
        app.launch(server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")