example_five / app.py
tejovanth's picture
Update app.py
40cfd28 verified
raw
history blame
5.15 kB
import gradio as gr
import fitz # PyMuPDF
import torch
from transformers import pipeline
import time, logging, re
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image
# Module start-up: configure logging and load both Hugging Face pipelines.
# If either model cannot be loaded the app cannot serve requests, so the
# process terminates with exit status 1.
logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only; passed to both pipelines below
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")

# Load summarizer
try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    print(f"❌ Summarizer model loading failed: {e}")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive sessions; raising SystemExit works in every environment.
    raise SystemExit(1) from e

# Load better QA model
try:
    qa_pipeline = pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        device=device,
    )
except Exception as e:
    print(f"❌ QA model loading failed: {e}")
    raise SystemExit(1) from e
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time and status.

    Args:
        chunk_data: Sequence of dicts, one per chunk. Each dict must provide
            ``'chunk'`` (used for the bar label, e.g. ``C1``) and ``'status'``
            (``'summarized'``, ``'skipped'`` or ``'error'``; any other value
            is drawn in gray). ``'time'`` is optional and defaults to 0.1.

    Returns:
        A PIL image opened from the chart rendered as an in-memory PNG.
    """
    status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    labels = [f"C{entry['chunk']}" for entry in chunk_data]
    colors = [status_colors.get(entry['status'], 'gray') for entry in chunk_data]
    times = [entry.get('time', 0.1) for entry in chunk_data]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    try:
        ax.barh(labels, times, color=colors)
        ax.set_xlabel("Time (s)")
        # NOTE(review): Matplotlib's default fonts may lack this emoji glyph;
        # confirm it renders on the deployment host.
        ax.set_title("📊 Chunk Processing Status")
        # Operate on this figure explicitly rather than through pyplot's
        # global "current figure", which another request could have replaced.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def summarize_file(file_bytes):
    """Summarize the leading chunks of an uploaded PDF and chart the progress.

    The PDF text is extracted, ``$...$`` delimiters are stripped, ``\\cap`` is
    replaced by "intersection", whitespace is collapsed and non-ASCII
    characters are dropped. The text is capped at 300,000 characters and split
    into 2,000-character chunks, of which only the first 3 are processed.
    A chunk whose characters are more than half non-alphanumeric is skipped;
    processing stops once 20 seconds have elapsed.

    Args:
        file_bytes: Raw PDF content as bytes, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A ``(summary_text, chart_image)`` tuple. When there is no upload,
        extraction fails or no text is found, the first item is a message
        and the second item is ``None``.
    """
    if not file_bytes:
        return "⚠️ Please upload a PDF file.", None

    max_chars = 300000
    chunk_size = 2000
    max_chunks = 3  # Limit to 3 chunks for testing
    time_budget = 20  # seconds

    start = time.time()
    chunk_info = []
    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()
        text = "".join(c for c in text if ord(c) < 128)
    except Exception as e:
        return f"❌ Text extraction failed: {e}", None

    # Dropping non-ASCII characters can leave only spaces behind.
    if not text.strip():
        return "❌ No text found", None

    text = text[:max_chars]
    # Only slice the chunks that will actually be used.
    last_start = min(len(text), chunk_size * max_chunks)
    chunks = [text[i:i + chunk_size] for i in range(0, last_start, chunk_size)]

    summaries = []
    for i, chunk in enumerate(chunks):
        chunk_start = time.time()
        chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
        if time.time() - start > time_budget:
            summaries.append("⚠️ Stopped early")
            break
        non_alnum = sum(1 for c in chunk if not c.isalnum())
        if non_alnum / len(chunk) > 0.5:
            summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(
                    chunk, max_length=60, min_length=10, do_sample=False
                )[0]['summary_text']
                summaries.append(f"### Chunk {i+1}\n{summary}")
                chunk_result['status'] = 'summarized'
            except Exception as e:
                # One failing chunk must not abort the remaining chunks.
                summaries.append(f"### Chunk {i+1}: ❌ Error: {e}")
                chunk_result['status'] = 'error'
        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    formatted_chunks = "\n\n---\n\n".join(summaries)
    # NOTE(review): the character count is the length of the extracted text
    # (after the 300,000 cap), not the number of characters summarized.
    final_summary = (
        f"**Characters Processed**: {len(text)}\n"
        f"**Total Time**: {time.time() - start:.2f} seconds\n"
        "## 🔹 Summary by Chunks\n"
        f"{formatted_chunks}\n"
    )
    image = visualize_chunk_status(chunk_info)
    return final_summary, image
def answer_question(file_bytes, question):
    """Answer a question using the text of an uploaded PDF as context.

    The PDF text is extracted, whitespace is collapsed, non-ASCII characters
    are dropped and the result is capped at 300,000 characters before being
    passed to the question-answering pipeline.

    Args:
        file_bytes: Raw PDF content as bytes, or ``None``/empty when nothing
            was uploaded.
        question: The question text; ``None`` or blank is rejected.

    Returns:
        A string holding the answer and its score, or a warning/error message
        when an input is missing, extraction fails, no text is found or the
        pipeline raises.
    """
    if not file_bytes:
        return "⚠️ Please upload a PDF file."
    # Validate the question first so a blank one never triggers PDF parsing.
    if not question or not question.strip():
        return "⚠️ Please enter a valid question."

    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\s+", " ", text).strip()
        text = "".join(c for c in text if ord(c) < 128)
        context = text[:300000]
    except Exception as e:
        return f"❌ Text extraction failed: {e}"

    # Report an empty document directly instead of as a pipeline error.
    if not context.strip():
        return "❌ No text found"

    try:
        result = qa_pipeline(question=question, context=context)
        return f"**Answer**: {result['answer']}\n\n**Score**: {result['score']:.2f}"
    except Exception as e:
        return f"❌ QA failed: {e}"
# Summarizer UI: one PDF upload in, summary text plus a status chart out.
# NOTE(review): the emoji in these labels were mis-encoded in the source;
# 📝 and 🔍 are reconstructed from truncated byte sequences — confirm.
summarizer_ui = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload PDF", type="binary"),
    outputs=[
        gr.Textbox(label="📝 Summarized Output", lines=30, show_copy_button=True),
        gr.Image(label="📊 Visual Process Flow", type="pil"),
    ],
    title="📝 AI-Powered PDF Summarizer",
    description="Summarizes long PDFs and visualizes chunk-level processing (limited to 3 chunks for testing).",
)

# Q&A UI: a PDF upload and a question in, the answer text out.
qa_ui = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(label="📄 Upload PDF", type="binary"),
        gr.Textbox(label="❓ Ask a Question"),
    ],
    outputs=gr.Textbox(label="🔍 Answer"),
    title="📚 PDF Q&A Assistant",
    description="Ask natural language questions from the uploaded PDF.",
)
# Tabs: serve both interfaces from a single app on port 7860.
if __name__ == "__main__":
    try:
        gr.TabbedInterface(
            [summarizer_ui, qa_ui],
            ["📝 Summarizer", "❓ Q&A Assistant"],
        ).launch(server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {e}")
        # Exit non-zero so the host sees the failure, matching the
        # model-loading failure paths, instead of finishing with status 0.
        raise SystemExit(1) from e