examplefour / app.py
tejovanth's picture
Update app.py
2d57a07 verified
raw
history blame
5.35 kB
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image
import nltk
# Fetch the sentence-tokenizer models if they are not already installed.
# Newer NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" data, so request both to keep sent_tokenize working on either.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)
from nltk.tokenize import sent_tokenize
# Root logger reports ERROR and above only.
logging.basicConfig(level=logging.ERROR)

# transformers pipelines interpret device=-1 as "run on CPU".
device = -1  # CPU-only
print("⚠️ Optimized CPU-only version.")

# Load the summarization model once at import time so every request reuses it.
try:
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device,
    )
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # `exit` is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist (e.g. `python -S`); SystemExit always is.
    raise SystemExit(1)
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time and status.

    Args:
        chunk_data: Iterable of dicts, one per chunk, with the keys
            ``'chunk'`` (number used for the bar label), ``'status'``
            (``'summarized'``, ``'skipped'`` or ``'error'``; any other value
            is drawn in gray) and, optionally, ``'time'`` in seconds
            (0.1 is used when the key is missing).

    Returns:
        A PIL image holding the chart rendered as PNG.
    """
    status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}

    # Materialize once so a generator argument can be traversed three times.
    chunk_data = list(chunk_data)
    labels = [f"C{item['chunk']}" for item in chunk_data]
    colors = [status_colors.get(item['status'], 'gray') for item in chunk_data]
    times = [item.get('time', 0.1) for item in chunk_data]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    buf = io.BytesIO()
    try:
        ax.barh(labels, times, color=colors)
        ax.set_xlabel("Time (s)")
        # Plain-text title: the previous title carried a garbled emoji, and
        # matplotlib's default font has no emoji glyphs to draw one with.
        ax.set_title("Chunk Processing Status")
        # Use the figure's own methods rather than pyplot's "current figure"
        # so overlapping requests cannot save each other's charts.
        fig.tight_layout()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
def create_summary_flowchart(summaries):
    """Draw the successful chunk summaries as a top-to-bottom flowchart.

    Args:
        summaries: Iterable of strings shaped like ``"**Chunk N**:<body>"``.
            Entries that do not start with ``"**Chunk"``, have an empty body,
            or whose body begins with a failure marker (``"Skipped"``,
            ``"Error"`` or ``"❌"``) are left out of the chart.

    Returns:
        A PIL image holding the flowchart rendered as PNG, or ``None`` when
        no successful summary is available.
    """
    failure_prefixes = ("Skipped", "Error", "❌")
    max_chars = 120

    # Keep only the bodies of successful summaries. The status is judged by
    # how the body *starts*, so a genuine summary that merely mentions the
    # word "Error" or "Skipped" is no longer discarded.
    filtered = []
    for entry in summaries:
        if not entry.startswith("**Chunk"):
            continue
        _, separator, body = entry.partition("**:")
        body = body.strip()
        if not separator or not body or body.startswith(failure_prefixes):
            continue
        filtered.append(body)

    if not filtered:
        return None

    top = len(filtered) * 2
    fig, ax = plt.subplots(figsize=(6, max(2, len(filtered) * 1.5)))
    buf = io.BytesIO()
    try:
        ax.axis('off')
        # Text and annotation artists do not trigger autoscaling; without
        # explicit limits the boxes (placed at y = 2, 4, ...) would fall
        # outside the default 0..1 data range and not be visible.
        ax.set_xlim(0, 1)
        ax.set_ylim(1, top + 1)

        boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
        last = len(filtered) - 1
        for i, (y, summary) in enumerate(zip(range(top, 0, -2), filtered)):
            summary_text = summary.replace("\n", " ").strip()
            # Add the ellipsis only when something was actually cut off.
            if len(summary_text) > max_chars:
                summary_text = summary_text[:max_chars] + "..."
            ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)
            if i < last:
                # Arrow pointing down towards the next box.
                ax.annotate('', xy=(0.5, y - 1.5), xytext=(0.5, y - 0.5),
                            arrowprops=dict(arrowstyle="->", lw=1.5))

        # Figure-level calls avoid depending on pyplot's global current figure.
        fig.tight_layout()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
def split_text_into_chunks(text, max_tokens=1500, max_chunks=20):
    """Group the sentences of ``text`` into size-limited chunks.

    Args:
        text: The text to split; sentence boundaries come from
            ``sent_tokenize``.
        max_tokens: Size budget per chunk. Despite the name this is measured
            in characters (``len``), not model tokens. A single sentence
            longer than the budget is kept whole as its own oversized chunk.
        max_chunks: Maximum number of chunks returned; later text is dropped.

    Returns:
        A list of at most ``max_chunks`` non-empty strings, each made of
        whole sentences joined by single spaces.
    """
    chunks = []
    parts = []   # sentences of the chunk currently being built
    length = 0   # len(" ".join(parts)), tracked incrementally

    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Account for the joining space when the chunk already has content.
        needed = len(sentence) + (1 if parts else 0)
        # Flush only a non-empty chunk: an over-long first sentence must not
        # emit an empty string, which downstream code would divide by.
        if parts and length + needed > max_tokens:
            chunks.append(" ".join(parts))
            if len(chunks) >= max_chunks:
                # Cap reached; no need to look at the remaining sentences.
                return chunks[:max_chunks]
            parts, length, needed = [], 0, len(sentence)
        parts.append(sentence)
        length += needed

    if parts:
        chunks.append(" ".join(parts))
    return chunks[:max_chunks]
def summarize_file(file_bytes):
    """Summarize an uploaded PDF chunk by chunk and build two status images.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        A 3-tuple ``(summary_text, process_img, flow_img)``. ``summary_text``
        is a markdown-style report with one entry per chunk, ``process_img``
        is the chart from ``visualize_chunk_status`` and ``flow_img`` the
        image from ``create_summary_flowchart`` (may be ``None``). When text
        extraction fails or yields nothing, the tuple is
        ``(error_message, None, None)``.
    """
    start = time.time()
    chunk_info = []
    summaries = []

    # Nothing uploaded: there is no text to summarize.
    if not file_bytes:
        return "❌ No text found", None, None

    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        # Unwrap inline $...$ math and spell out \cap so they read as prose.
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        # Collapse whitespace runs, then drop every non-ASCII character.
        text = re.sub(r"\s+", " ", text).strip()
        text = text.encode("ascii", "ignore").decode("ascii")
    except Exception as e:
        return f"❌ Text extraction failed: {e}", None, None

    if not text.strip():
        return "❌ No text found", None, None

    # Ignore empty chunks so the ratio below never divides by zero.
    chunks = [chunk for chunk in split_text_into_chunks(text) if chunk]

    for number, chunk in enumerate(chunks, start=1):
        chunk_start = time.time()
        chunk_result = {'chunk': number, 'status': '', 'time': 0}

        # Chunks that are mostly non-alphanumeric characters (spaces count
        # too) are treated as equation-heavy and not sent to the model.
        non_alnum = sum(1 for c in chunk if not c.isalnum())
        if non_alnum / len(chunk) > 0.5:
            summaries.append(f"**Chunk {number}**: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(
                    chunk, max_length=80, min_length=15, do_sample=False
                )[0]['summary_text']
            except Exception as e:
                # One failing chunk is reported in place; the rest still run.
                summaries.append(f"**Chunk {number}**: ❌ Error: {e}")
                chunk_result['status'] = 'error'
            else:
                summaries.append(f"**Chunk {number}**:\n{summary}")
                chunk_result['status'] = 'summarized'

        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    final_summary = (
        f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n"
        + "\n\n".join(summaries)
    )
    process_img = visualize_chunk_status(chunk_info)
    flow_img = create_summary_flowchart(summaries)
    return final_summary, process_img, flow_img
# Gradio UI: one PDF upload in; summary text plus two diagnostic images out.
# The emoji in the labels and title were mis-encoded (mojibake) and are
# restored here.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload PDF", type="binary"),
    outputs=[
        gr.Textbox(label="📝 Summary"),
        gr.Image(label="📊 Chunk Status", type="pil"),
        # NOTE(review): the original glyph here could not be recovered
        # unambiguously from the garbled bytes; 🔁 is a best guess - confirm.
        gr.Image(label="🔁 Flow Summary", type="pil"),
    ],
    title="📘 PDF Summarizer with Visual Flow",
    description="Summarizes up to 30,000 characters from a PDF. Includes chunk status and flowchart visualizations.",
)

if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        # Report the failure instead of showing a raw traceback.
        print(f"❌ Gradio launch failed: {e}")