examplefour / app.py
tejovanth's picture
Create app.py
fcfdebb verified
raw
history blame
3.47 kB
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re
import matplotlib.pyplot as plt
import io
# Only ERROR-level (and above) log records are emitted; user-facing progress
# and failures in this script are reported with print().
logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")

# Load the summarization model once at import time so that every request
# handled by summarize_file() reuses the same pipeline object.
try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # The app cannot do anything without the model, so stop with a non-zero
    # status. SystemExit is raised directly because the bare exit() helper is
    # injected by the `site` module and is not guaranteed to be defined
    # (e.g. under `python -S` or in frozen/embedded interpreters).
    raise SystemExit(1) from e
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time and status.

    Args:
        chunk_data: Sequence of dicts, one per processed chunk. Each dict
            must provide ``'chunk'`` (used to build the ``C<n>`` bar label)
            and ``'status'``; ``'summarized'``, ``'skipped'`` and ``'error'``
            are drawn green, orange and red, and any other status is drawn
            gray. ``'time'`` (seconds) is optional.

    Returns:
        io.BytesIO: The chart encoded as PNG, rewound to position 0.
    """
    status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    labels = [f"C{item['chunk']}" for item in chunk_data]
    colors = [status_colors.get(item['status'], 'gray') for item in chunk_data]
    # Entries without a recorded time still get a small visible bar.
    times = [item.get('time', 0.1) for item in chunk_data]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    try:
        ax.barh(labels, times, color=colors)
        ax.set_xlabel("Time (s)")
        ax.set_title("📊 Chunk Processing Status")
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure it creates alive until it is closed
        # explicitly; without this, each call would leak one figure.
        plt.close(fig)
    buf.seek(0)
    return buf
def summarize_file(file_bytes):
    """Summarize an uploaded PDF chunk by chunk and chart the per-chunk status.

    The PDF text is extracted, lightly cleaned (inline ``$...$`` delimiters
    removed, ``\\cap`` spelled out, whitespace collapsed, non-ASCII characters
    dropped), capped at 300,000 characters and split into 2,000-character
    chunks. Each chunk is either summarized with the module-level
    ``summarizer`` or skipped when more than half of its characters are
    non-alphanumeric. Processing stops once 20 seconds have elapsed since the
    call started.

    Args:
        file_bytes: Raw bytes of the PDF, passed to ``fitz.open`` as a stream.

    Returns:
        tuple: ``(summary_text, chart)``. ``summary_text`` is a string with
        character count, elapsed time and one entry per chunk. ``chart`` is
        a uint8 RGBA image array of the chunk-status bar chart. When text
        extraction fails or yields no text, the result is
        ``(error_message, None)``.
    """
    max_chars = 300000
    chunk_size = 2000
    time_budget_s = 20

    start = time.time()
    chunk_info = []
    try:
        # The context manager closes the document even if a page fails to
        # parse, so the underlying PDF resources are not leaked per request.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()
        text = "".join(c for c in text if ord(c) < 128)
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}", None
    if not text.strip():
        return "❌ No text found", None

    text = text[:max_chars]
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []
    for i, chunk in enumerate(chunks):
        chunk_start = time.time()
        chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
        if time.time() - start > time_budget_s:
            summaries.append("⚠️ Stopped early")
            break
        # Chunks dominated by non-alphanumeric characters are treated as
        # equation-heavy and are not sent to the model.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
                summaries.append(f"**Chunk {i+1}**:\n{summary}")
                chunk_result['status'] = 'summarized'
            except Exception as e:
                # One failing chunk is reported inline; the rest still run.
                summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
                chunk_result['status'] = 'error'
        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    final_summary = f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
    image_buf = visualize_chunk_status(chunk_info)
    # gr.Image outputs accept a numpy array, a PIL image or a file path, but
    # not a BytesIO. Decode the PNG (imread yields floats in 0-1 for PNG) and
    # scale it to a uint8 array so the chart can actually be displayed.
    rgba = plt.imread(image_buf, format="png")
    chart = (rgba * 255).round().astype("uint8")
    return final_summary, chart
# Gradio UI: one PDF upload in; the text summary and the chunk-status chart
# produced by summarize_file() out.
demo = gr.Interface(
    fn=summarize_file,
    # type="binary" makes Gradio pass the uploaded file's raw bytes to fn.
    inputs=gr.File(label="📄 Upload PDF", type="binary"),
    outputs=[
        gr.Textbox(label="📝 Summarized Output"),
        gr.Image(label="📊 Visual Process Flow"),
    ],
    title="AI-Powered PDF Summarizer",
    description="Summarizes long PDFs (up to 300,000 characters) and visualizes chunk-level automation status.",
)
if __name__ == "__main__":
    # Start the web UI on port 7860 without creating a public share link.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")
        # Report the failure through the exit status as well, matching the
        # model-loading failure path, instead of ending with status 0.
        raise SystemExit(1) from e