Spaces:

tejovanth
/

examplethree

Sleeping

App Files Files Community

examplethree / app.py

tejovanth

Create app.py

3930fe6 verified 11 months ago

raw

history blame

3.96 kB

	import gradio as gr
	import fitz
	import torch
	from transformers import pipeline
	import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes
	from PIL import Image
	from io import BytesIO
	from striprtf.striprtf import rtf_to_text

	# Only ERROR-level records are emitted through the logging module.
	logging.basicConfig(level=logging.ERROR)

	# -1 is the transformers pipeline convention for "run on CPU".
	device = -1
	print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")

	# Load the summarization model once at import time so that every request
	# reuses the same pipeline object.
	try:
	    summarizer = pipeline(
	        "summarization",
	        model="t5-small",
	        device=device,
	        torch_dtype=torch.float32,
	    )
	except Exception as e:
	    print(f"❌ Model loading failed: {e}")
	    # `exit()` is a convenience injected by the `site` module for interactive
	    # sessions and may be missing (e.g. `python -S`, frozen apps); raising
	    # SystemExit directly always works and keeps the original cause attached.
	    raise SystemExit(1) from e

	# Tunables that used to be inline magic numbers.
	_MAX_CHARS = 300_000          # hard cap on how much extracted text is summarized
	_CHUNK_CHARS = 1000           # characters per chunk handed to the model
	_BATCH_SIZE = 4               # chunks per summarizer call
	_TIME_BUDGET_S = 15           # stop starting new batches after this many seconds
	_SYMBOL_RATIO_LIMIT = 0.7     # share of non-alphanumerics that marks a chunk as "equation-heavy"
	_OCR_HEIGHT = 300             # images are scaled to this height before OCR

	# mimetypes usually reports RTF as application/rtf; text/rtf is kept because
	# the original code matched on it.
	_RTF_MIMES = ("text/rtf", "application/rtf")
	_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	_XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


	def _load_upload(file):
	    """Return ``(data, name, disk_path)`` for an uploaded file.

	    Accepts raw bytes, a filesystem path string, or an object exposing
	    ``.name`` and/or ``.read()``.

	    * ``data``      -- the file content as ``bytes``.
	    * ``name``      -- a file name usable for MIME guessing, or ``None``.
	    * ``disk_path`` -- a path that was successfully opened on disk, or ``None``.

	    Raises ``OSError`` if a path string cannot be read and ``ValueError`` if the
	    object cannot be read at all.
	    """
	    if isinstance(file, (bytes, bytearray)):
	        return bytes(file), None, None

	    if isinstance(file, str):
	        with open(file, "rb") as fh:
	            return fh.read(), file, file

	    name = getattr(file, "name", None)
	    if not isinstance(name, str):
	        # e.g. an integer file descriptor; never hand that to open().
	        name = None

	    if name is not None:
	        # Prefer the bytes on disk: they are complete regardless of the
	        # object's read position or text/binary mode.
	        # NOTE(review): some Gradio releases appear to pass a wrapper whose
	        # .name is the real temp path -- confirm against the pinned version.
	        try:
	            with open(name, "rb") as fh:
	                return fh.read(), name, name
	        except OSError:
	            pass  # name is only a label; fall back to read() below

	    if hasattr(file, "read"):
	        data = file.read()
	        if isinstance(data, str):
	            data = data.encode("utf-8")
	        return data, name, None

	    raise ValueError(f"unsupported upload object: {type(file).__name__}")


	def _extract_text(data, mime, disk_path):
	    """Extract raw text from ``data`` using the extractor selected by ``mime``."""
	    if mime == "application/pdf":
	        # Context manager closes the document (the original leaked it).
	        with fitz.open(stream=data, filetype="pdf") as doc:
	            return "".join(page.get_text("text") for page in doc)

	    if mime in _RTF_MIMES:
	        return rtf_to_text(data.decode("utf-8", errors="ignore"))

	    if mime == "text/plain":
	        return data.decode("utf-8", errors="ignore")

	    if mime in ("text/csv", "application/vnd.ms-excel"):
	        # NOTE(review): application/vnd.ms-excel is presumably listed because
	        # some platforms report it for .csv; a genuine .xls file will not
	        # parse as CSV -- confirm whether .xls support is wanted.
	        frame = pd.read_csv(BytesIO(data))
	        return " ".join(frame.astype(str).values.flatten())

	    if mime == _DOCX_MIME:
	        document = docx.Document(BytesIO(data))
	        return " ".join(p.text for p in document.paragraphs if p.text)

	    if mime in ("image/jpeg", "image/png"):
	        # The original referenced `img` inside the expression that defined it,
	        # raising UnboundLocalError for every image.
	        with Image.open(BytesIO(data)) as img:
	            gray = img.convert("L")
	        # Keep the aspect ratio; clamp so extremely tall images never get width 0.
	        width = max(1, int(gray.width * _OCR_HEIGHT / gray.height))
	        return pytesseract.image_to_string(gray.resize((width, _OCR_HEIGHT)))

	    if mime == _XLSX_MIME:
	        frame = pd.read_excel(BytesIO(data), engine="openpyxl")
	        return " ".join(frame.astype(str).values.flatten())

	    # Generic fallback. textract.process() takes a *path* (it picks a parser
	    # from the extension); the original passed the file's bytes instead.
	    if disk_path is None:
	        raise ValueError("unsupported file type (no on-disk path for generic extraction)")
	    return textract.process(disk_path).decode("utf-8", errors="ignore")


	def _clean_text(text):
	    """Normalize extracted text before chunking."""
	    text = re.sub(r"\$\s([^$]+)\s\$", r"\1", text)   # drop "$ ... $" math delimiters
	    text = re.sub(r"\\cap", "intersection", text)     # spell out the LaTeX \cap operator
	    text = re.sub(r"\s+", " ", text).strip()          # collapse whitespace runs
	    # Drop every non-ASCII character (same result as filtering on ord(c) < 128).
	    return text.encode("ascii", errors="ignore").decode("ascii")


	def _is_symbol_heavy(chunk):
	    """Return True when more than ``_SYMBOL_RATIO_LIMIT`` of ``chunk`` is non-alphanumeric."""
	    non_alnum = sum(1 for ch in chunk if not ch.isalnum())
	    return non_alnum / len(chunk) > _SYMBOL_RATIO_LIMIT


	def summarize_file(file):
	    """Extract the text of an uploaded file and summarize it chunk by chunk.

	    Args:
	        file: The upload. May be a path string, raw bytes, or an object with
	            ``.name`` and/or ``.read()``. ``None`` (nothing uploaded) is
	            reported as an error message.

	    Returns:
	        A human-readable string: either an error message starting with "❌",
	        or a report with the character count, elapsed time and one entry per
	        chunk (summary, skip notice, or per-batch error).

	    The function never raises for extraction or model errors; they are
	    converted into the returned message so the UI can display them.
	    """
	    if file is None:
	        return "❌ No file provided"

	    # Monotonic clock: elapsed-time checks must not be affected by wall-clock jumps.
	    start = time.monotonic()
	    label = file if isinstance(file, str) else getattr(file, "name", "unknown")
	    print(f"File: {label}")

	    try:
	        data, name, disk_path = _load_upload(file)
	        mime, _ = mimetypes.guess_type(name) if name else (None, None)
	        text = _clean_text(_extract_text(data, mime, disk_path))
	        print(f"Extracted chars: {len(text)}")
	    except Exception as e:
	        # UI boundary: surface any extractor failure as a message, not a traceback.
	        return f"❌ Text extraction failed: {e}"

	    if not text.strip():
	        return "❌ No text found"

	    text = text[:_MAX_CHARS]
	    chunks = [text[i:i + _CHUNK_CHARS] for i in range(0, len(text), _CHUNK_CHARS)]
	    print(f"Chunks created: {len(chunks)}")

	    summaries = []
	    for i in range(0, len(chunks), _BATCH_SIZE):
	        if time.monotonic() - start > _TIME_BUDGET_S:
	            summaries.append("⚠️ Stopped early")
	            break
	        batch = chunks[i:i + _BATCH_SIZE]
	        # One symbol-heavy chunk causes the whole batch to be skipped.
	        if any(_is_symbol_heavy(chunk) for chunk in batch):
	            summaries.append(f"Chunk {i+1}–{i+len(batch)}: Skipped (equation-heavy)")
	            continue
	        try:
	            results = summarizer(batch, max_length=50, min_length=10, do_sample=False)
	            summaries.extend(
	                f"Chunk {i+j+1}:\n{r['summary_text']}" for j, r in enumerate(results)
	            )
	        except Exception as e:
	            # Keep going: a failed batch should not discard earlier summaries.
	            summaries.append(f"Chunk {i+1}–{i+len(batch)}: ❌ Error: {e}")

	    elapsed = time.monotonic() - start
	    return f"Chars: {len(text)}\nTime: {elapsed:.2f}s\n\n" + "\n\n".join(summaries)

	# Web UI: a single file upload in, a single text box with the summary out.
	#
	# The explicit `type="file"` argument was removed from gr.File: Gradio 3.x
	# already defaults to it, while Gradio 4+ is understood to reject that value
	# with a ValueError at construction time (only "filepath"/"binary" remain).
	# Relying on the component default keeps the app constructible on both.
	# NOTE(review): on Gradio 4+ the handler then receives a path string rather
	# than a file object -- confirm summarize_file copes with that for the
	# Gradio version this Space pins.
	demo = gr.Interface(
	    fn=summarize_file,
	    inputs=gr.File(label="📄 Any File"),
	    outputs=gr.Textbox(label="📝 Summary"),
	    title="Fast Summarizer",
	    description="300,000+ chars in ~10–15s (CPU)",
	)

	# Script entry point: start the Gradio server on port 7860 without a public
	# share link.
	if __name__ == "__main__":
	    try:
	        demo.launch(share=False, server_port=7860)
	    except Exception as e:
	        print(f"❌ Gradio launch failed: {e}")
	        # Exit non-zero so a supervisor sees the failure; previously the error
	        # was printed and the process ended with status 0.
	        raise SystemExit(1) from e