# examplethree / app.py
# Source: Hugging Face Space (user: tejovanth), commit 3930fe6 (verified), 3.96 kB.
# The original page chrome ("raw / history / blame") has been folded into this
# comment header so the file is valid Python.
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
from PIL import Image
from io import BytesIO
from striprtf.striprtf import rtf_to_text
# Suppress library chatter; only errors reach the console.
logging.basicConfig(level=logging.ERROR)

device = -1  # transformers pipeline convention: -1 selects CPU
print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")

try:
    # t5-small keeps memory and latency low enough for CPU-only inference.
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # FIX: exit() is injected by the `site` module and may be absent
    # (e.g. `python -S`); SystemExit is always available.
    raise SystemExit(1)
def summarize_file(file):
    """Extract text from an uploaded file and return a chunked t5-small summary.

    Accepts either a file-like object (with ``.read()``/``.name``) or raw bytes;
    the MIME type is guessed from the filename when one is available.

    Returns a markdown string with per-chunk summaries, or an "❌ ..." message
    on extraction failure / empty input.
    """
    start = time.time()
    print(f"File: {file.name if hasattr(file, 'name') else 'unknown'}")
    try:
        file_bytes = file.read() if hasattr(file, 'read') else file
        mime, _ = mimetypes.guess_type(file.name) if hasattr(file, 'name') else (None, None)
        text = ""
        if mime == 'application/pdf':
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            text = "".join(page.get_text("text") for page in doc)
        elif mime in ['text/plain', 'text/rtf']:
            text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
        elif mime in ['text/csv', 'application/vnd.ms-excel']:
            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            doc = docx.Document(BytesIO(file_bytes))
            text = " ".join(p.text for p in doc.paragraphs if p.text)
        elif mime in ['image/jpeg', 'image/png']:
            # BUG FIX: the original built the resize tuple from `img.width`
            # inside img's own initializer, raising NameError on every image
            # upload. Open/convert first, then scale to 300px height while
            # preserving aspect ratio before OCR.
            img = Image.open(BytesIO(file_bytes)).convert('L')
            img = img.resize((int(img.width * 300 / img.height), 300))
            text = pytesseract.image_to_string(img)
        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
            text = " ".join(df.astype(str).values.flatten())
        else:
            # NOTE(review): textract.process() takes a filesystem *path*, not
            # raw bytes — this fallback likely fails for in-memory uploads and
            # the error is reported via the except below. Confirm intent.
            text = textract.process(file_bytes).decode("utf-8", errors="ignore")
        # Clean-up for the summarizer: drop inline $...$ math delimiters,
        # spell out \cap, collapse whitespace, strip non-ASCII.
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()
        text = "".join(c for c in text if ord(c) < 128)
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"
    if not text.strip():
        return "❌ No text found"
    text = text[:300000]  # hard cap to bound CPU time
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    print(f"Chunks created: {len(chunks)}")
    if not chunks:
        return "❌ No chunks to summarize"
    summaries = []
    for i in range(0, len(chunks), 4):  # summarize 4 chunks per pipeline call
        if time.time() - start > 15:  # global wall-clock budget
            summaries.append("⚠️ Stopped early")
            break
        batch = chunks[i:i+4]
        # Skip batches dominated by non-alphanumeric characters
        # (likely equations/tables the model would mangle).
        if any(sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7 for chunk in batch):
            summaries.append(f"**Chunk {i+1}–{i+len(batch)}**: Skipped (equation-heavy)")
            continue
        try:
            results = summarizer(batch, max_length=50, min_length=10, do_sample=False)
            summaries.extend(f"**Chunk {i+j+1}**:\n{r['summary_text']}" for j, r in enumerate(results))
        except Exception as e:
            summaries.append(f"**Chunk {i+1}–{i+len(batch)}**: ❌ Error: {str(e)}")
    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
# Gradio UI: one file input, one textbox output.
# NOTE(review): gr.File(type="file") was removed in Gradio 4.x in favor of
# "filepath"/"binary" — confirm the Space pins a 3.x Gradio version.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="πŸ“„ Any File", type="file"),
    outputs=gr.Textbox(label="πŸ“ Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~10–15s (CPU)",
)

if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")