# examplethree / app.py
# tejovanth's picture
# Update app.py
# 47a8c64 verified
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
from PIL import Image
from io import BytesIO
from striprtf.striprtf import rtf_to_text
import tempfile
# Only log errors; progress information in this file is emitted with print().
logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~5–9s for 300,000 chars.")

# Load the summarization model once at import time so every request reuses it.
try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    # Nothing in this app works without the model, so stop with a non-zero status.
    print(f"❌ Model loading failed: {str(e)}")
    # exit() is a helper injected by the `site` module for interactive sessions
    # and is not guaranteed to exist; raising SystemExit is the portable form.
    raise SystemExit(1)
# Limits that shape the summary: input cap, chunk width, number of chunk entries.
_MAX_CHARS = 300000
_CHUNK_SIZE = 1000
_NUM_SUMMARIES = 12

_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
_XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

# Leading bytes used to recognise a format when the file name gives no hint.
_MAGIC_SIGNATURES = (
    (b"%PDF-", 'application/pdf'),
    (b"\x89PNG\r\n\x1a\n", 'image/png'),
    (b"\xff\xd8\xff", 'image/jpeg'),
    (b"{\\rtf", 'text/rtf'),
)


def _detect_mime(file_bytes, filename):
    """Return a MIME type for the upload, sniffing the content if the name is unhelpful."""
    # guess_type() returns the tuple (None, None) for unknown names, which is
    # truthy, so the fallback has to be applied to the first element only.
    mime = mimetypes.guess_type(filename)[0]
    if mime:
        return mime
    for signature, sniffed in _MAGIC_SIGNATURES:
        if file_bytes.startswith(signature):
            return sniffed
    if file_bytes.startswith(b"PK\x03\x04"):
        # .docx and .xlsx are both zip archives; tell them apart by their main part.
        import zipfile  # local import: only needed on this path
        try:
            with zipfile.ZipFile(BytesIO(file_bytes)) as archive:
                members = set(archive.namelist())
        except zipfile.BadZipFile:
            members = set()
        if "word/document.xml" in members:
            return _DOCX_MIME
        if "xl/workbook.xml" in members:
            return _XLSX_MIME
    return 'text/plain'


def _extract_with_textract(file_bytes, filename):
    """Run textract on the bytes via a temporary file, since it reads from a path."""
    import os  # local import: the module does not import os at file level

    suffix = os.path.splitext(filename)[1]
    # delete=False plus an explicit close lets textract reopen the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        return textract.process(tmp_path).decode("utf-8", errors="ignore")
    finally:
        os.remove(tmp_path)


def _extract_text(file_bytes, filename):
    """Return the raw text of the upload using the extractor that matches its type."""
    mime = _detect_mime(file_bytes, filename)
    if mime == 'application/pdf':
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            return "".join(page.get_text("text") for page in doc)
        finally:
            doc.close()
    if mime in ('text/rtf', 'application/rtf'):
        return rtf_to_text(file_bytes.decode("utf-8", errors="ignore"))
    if mime == 'text/plain':
        return file_bytes.decode("utf-8", errors="ignore")
    if mime in ('text/csv', 'application/vnd.ms-excel'):
        # NOTE(review): 'application/vnd.ms-excel' is parsed as CSV here, as in
        # the original; a genuine .xls workbook would fail to parse - confirm intent.
        return " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
    if mime == _DOCX_MIME:
        document = docx.Document(BytesIO(file_bytes))
        return " ".join(p.text for p in document.paragraphs if p.text)
    if mime in ('image/jpeg', 'image/png'):
        # Open and convert first, then size from the converted image: the
        # dimensions are not available until the image object exists.
        img = Image.open(BytesIO(file_bytes)).convert('L')
        width = max(1, int(img.width * 300 / img.height))
        return pytesseract.image_to_string(img.resize((width, 300)))
    if mime == _XLSX_MIME:
        df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
        return " ".join(df.astype(str).values.flatten())
    return _extract_with_textract(file_bytes, filename)


def _clean_text(text):
    """Normalise extracted text to single-spaced printable ASCII."""
    # Collapse whitespace first: newlines and tabs lie outside \x20-\x7E, so
    # filtering before this step would glue the neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Printable ASCII only
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # drop the $...$ delimiters
    text = re.sub(r"\\cap", "intersection", text)
    # Removing characters can leave doubled spaces behind, so collapse again.
    return re.sub(r"\s+", " ", text).strip()


def summarize_file_bytes(file_bytes, filename):
    """Extract text from one uploaded file and summarize a sample of its chunks.

    The text is capped at 300,000 characters and cut into 1,000-character
    chunks. Up to 12 evenly spaced chunks are summarized; the report is padded
    to exactly 12 chunk entries when fewer are available.

    Args:
        file_bytes: Raw content of the file (bytes or bytearray).
        filename: Name used in messages and to guess the file type. When the
            name reveals no type, the content is sniffed and plain text is
            assumed as the last resort.

    Returns:
        A ``(summary, summary)`` tuple holding the same Markdown report twice,
        or ``(error_message, "")`` when the input is invalid, too short, or
        extraction fails.
    """
    start = time.time()
    try:
        if not isinstance(file_bytes, (bytes, bytearray)) or len(file_bytes) == 0:
            return f"❌ {filename}: Invalid or empty file", ""
        text = _clean_text(_extract_text(bytes(file_bytes), filename))
        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
            return f"❌ {filename}: Invalid or too short text", ""
        print(f"Extracted chars for {filename}: {len(text)}")
    except Exception as e:
        # Best-effort boundary: any extractor failure becomes a per-file message.
        return f"❌ {filename}: Text extraction failed: {str(e)}", ""

    text = text[:_MAX_CHARS]
    chunks = [text[i:i + _CHUNK_SIZE] for i in range(0, len(text), _CHUNK_SIZE)]
    print(f"Chunks for {filename}: {len(chunks)}")
    if not chunks:
        return f"❌ {filename}: No chunks to summarize", ""

    # Sample evenly across the document when there are more chunks than slots.
    if len(chunks) >= _NUM_SUMMARIES:
        selected_indices = [i * len(chunks) // _NUM_SUMMARIES for i in range(_NUM_SUMMARIES)]
    else:
        selected_indices = list(range(len(chunks)))

    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        # Chunks that are mostly non-alphanumeric are skipped rather than summarized.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            # One failing chunk must not discard the remaining summaries.
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")

    # Pad so the report always carries the same number of chunk entries.
    while len(summaries) < _NUM_SUMMARIES:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")

    header = f"📄 **{filename}**\n**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n"
    summary_text = header + "\n\n".join(summaries[:_NUM_SUMMARIES])
    return summary_text, summary_text
def _upload_basename(name):
    """Return the last path component of *name*, splitting on forward or back slashes."""
    return re.split(r"[\\/]", str(name))[-1]


def _read_upload(upload, position):
    """Return ``(file_bytes, filename)`` for one upload, or None if its type is unsupported."""
    if isinstance(upload, (bytes, bytearray)):
        # Raw bytes carry no name, so a positional placeholder is used instead.
        return bytes(upload), f"upload_{position}"
    if isinstance(upload, str):
        # A string is treated as a path to the uploaded file on disk.
        with open(upload, "rb") as handle:
            return handle.read(), _upload_basename(upload)
    if hasattr(upload, 'read') and hasattr(upload, 'name'):
        data = upload.read()
        if isinstance(data, str):
            data = data.encode("utf-8")
        return data, _upload_basename(upload.name)
    return None


def summarize_multiple_files(*file_objs):
    """Summarize every uploaded file and write the combined result to a .txt file.

    Args:
        *file_objs: Either a single list/tuple of uploads or the uploads passed
            positionally. Each upload may be raw bytes, a filesystem path
            string, or a file-like object exposing ``read()`` and ``name``.

    Returns:
        A ``(summary_text, summary_file_path)`` tuple: the per-file summaries
        joined by blank lines and the path of a temporary .txt file holding
        them. Returns ``("❌ No files uploaded", None)`` when nothing was given.
    """
    if not file_objs or not any(file_objs):
        return "❌ No files uploaded", None

    uploads = file_objs[0] if isinstance(file_objs[0], (list, tuple)) else file_objs
    all_summaries = []
    sections = []
    for position, file in enumerate(uploads, start=1):
        try:
            loaded = _read_upload(file, position)
        except OSError as err:
            all_summaries.append(f"❌ {file}: Could not read file: {err}")
            continue
        if loaded is None:
            all_summaries.append("❌ Invalid file: Missing read() or name")
            continue
        file_bytes, filename = loaded
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        sections.append(f"\n\n{raw_text}\n" + "=" * 60 + "\n")

    # delete=False keeps the file on disk after the handle closes, so the
    # returned path stays valid for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(sections))
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
# Gradio UI: a multi-file upload wired to summarize_multiple_files, showing the
# summary text and offering it as a downloadable .txt file.
demo = gr.Interface(
    fn=summarize_multiple_files,
    # NOTE(review): with type="binary" Gradio is documented to pass raw bytes
    # (no file name) to the callback - confirm the callback accepts bytes.
    inputs=gr.File(label="📄 Upload Any File", type="binary", file_count="multiple"),
    outputs=[
        gr.Textbox(label="📝 Summary", lines=15, max_lines=100),
        gr.File(label="📥 Download Summary as .txt")
    ],
    title="📚 Multi-File Summarizer",
    description="Summarizes each file into 12 chunk summaries. Download as .txt. ~5–9s for 300,000 chars (CPU)."
)
# Script entry point: start the Gradio server only when this file is run directly.
if __name__ == "__main__":
    launch_options = {"share": False, "server_port": 7860}
    try:
        demo.launch(**launch_options)
    except Exception as launch_error:
        # Report the failure as a single line instead of an uncaught traceback.
        print(f"❌ Gradio launch failed: {launch_error!s}")