|
|
import gradio as gr |
|
|
import re |
|
|
from transformers import pipeline, AutoTokenizer |
|
|
from PyPDF2 import PdfReader |
|
|
import tempfile |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face model id for the chunk summarizer (small enough for CPU).
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"

# Shared tokenizer: used by the summarization pipeline AND for token-aware
# chunking in chunk_text(), so chunk boundaries match what the model sees.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Summarization pipeline; device=-1 pins inference to the CPU.
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1
)

# Separate instruction-tuned model that turns the finished summary into
# study tips (see generate_ai_advice()).
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1
)

# Tokens per chunk fed to the summarizer -- presumably kept below the
# model's input limit (BART-family encoders top out at 1024 tokens); confirm.
CHUNK_SIZE = 900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize summarizer output: straighten curly quotes, collapse
    repeated punctuation/whitespace, and drop duplicated sentences.

    Args:
        text: Raw summary text from the model.

    Returns:
        Cleaned text with each distinct sentence appearing at most once,
        order preserved.
    """
    # Fix: the original literals were mojibake'd into "β", so each pair of
    # replace() calls targeted the SAME character and the second call of
    # each pair was a no-op. Use explicit escapes for the curly quotes.
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    # Collapse runs of dots/apostrophes and whitespace into single chars.
    text = re.sub(r"[.]{2,}", ".", text)
    text = re.sub(r"[']{2,}", "'", text)
    text = re.sub(r"\s+", " ", text)
    # De-duplicate sentences case-insensitively while preserving order.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)
|
|
|
|
|
def chunk_text(text: str):
    """Split *text* into decoded chunks of at most CHUNK_SIZE tokens.

    Token-aware chunking keeps each piece within the summarizer's input
    budget regardless of how tokens map to characters.

    Args:
        text: Full document text.

    Returns:
        List of text chunks (empty list for empty input).
    """
    # Fix: the original loop bound a local named `chunk_text`, shadowing
    # this function's own name inside its body.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[start:start + CHUNK_SIZE], skip_special_tokens=True)
        for start in range(0, len(token_ids), CHUNK_SIZE)
    ]
|
|
|
|
|
def generate_ai_advice(summary: str) -> str:
    """Generate personalized study advice based on the paper summary.

    Args:
        summary: Markdown summary text (typically produced by
            summarize_long_text).

    Returns:
        A markdown section with up to 5 AI-generated study tips.
    """
    # Keep the prompt short for the small model's limited context window.
    truncated_summary = summary[:1000]

    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )

    # Greedy decoding. Fix: `temperature=0.7` was removed -- transformers
    # ignores (and warns about) temperature when do_sample=False.
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False
    )[0]["generated_text"]

    # Prefer one tip per line. Fix: the old filter
    # `startswith('-') or tip.strip()` was redundant -- the truthiness
    # check alone decided the outcome.
    tips = [tip.strip() for tip in generated.split('\n') if tip.strip()]
    if len(tips) < 3:
        # Model ignored the bullet format; fall back to sentence splitting.
        tips = [t.strip() for t in generated.split('.') if t.strip()]

    advice_md = "\n\n---\n\n### π AI-Generated Study Tips\n\n"
    for tip in tips[:5]:
        # Strip any leading bullet markers the model emitted itself.
        advice_md += f"- {tip.lstrip('- ').strip()}\n"

    advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
    return advice_md
|
|
|
|
|
def extract_possible_headings(text: str) -> str:
    """Heuristically pull likely titles and subtitles out of raw text.

    A line counts as a heading when it is non-empty, shorter than 80
    characters, and is either all uppercase, numbered ("1. Intro" style),
    or opens like Title Case ("Word Word").

    Returns:
        A markdown section listing the headings, or "" when none match.
    """
    def _looks_like_heading(candidate: str) -> bool:
        # Short line that is ALL CAPS, "1. Section", or "Word Word".
        if not candidate or len(candidate) >= 80:
            return False
        return bool(
            candidate.isupper()
            or re.match(r'^\d+\.?\s', candidate)
            or re.match(r'^[A-Z][a-z]+\s[A-Z]', candidate)
        )

    headings = [ln.strip() for ln in text.split('\n') if _looks_like_heading(ln.strip())]
    if not headings:
        return ""
    # Leading '' makes every heading (including the first) get a "- " prefix.
    bullets = "\n- ".join([''] + headings)
    return "### Extracted Possible Headings/Subtitles\n\n" + bullets + "\n\n---\n\n"
|
|
|
|
|
def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long text chunk-by-chunk, then append AI study advice.

    Args:
        text: Raw document text (pasted or extracted from a PDF).
        progress: Gradio progress tracker (injected by Gradio at call time;
            the default instance is Gradio's documented convention).

    Returns:
        Markdown string: optional headings section + per-chunk bullet
        summaries + AI-generated study tips, or an error message for
        empty input.
    """
    if not text or not text.strip():
        return "No text provided."

    progress(0, desc="Extracting headings...")

    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)

    summaries = []
    progress(0.2, desc="Summarizing chunks...")
    for i in progress.tqdm(range(len(chunks))):
        try:
            summary = summarizer(
                chunks[i],
                max_length=200,
                min_length=60,
                do_sample=False
            )[0]["summary_text"]
            summaries.append(f"**Chunk {i+1} Summary:** {clean_text(summary)}")
        except Exception:
            # Fix: failures were silently swallowed (`pass`), which could
            # yield an empty summary with no explanation. Stay best-effort,
            # but surface the gap to the user.
            summaries.append(f"**Chunk {i+1} Summary:** *(summarization failed for this chunk)*")

    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for entry in summaries:
        summary_md += f"- {entry}\n"

    progress(0.8, desc="Generating AI advice...")
    ai_advice = generate_ai_advice(summary_md)

    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice
|
|
|
|
|
def read_pdf(file) -> str:
    """Safely extract all page text from a PDF.

    Returns the concatenated page text, or an error-message string
    ("PDF read error: ...") when extraction fails.
    """
    try:
        document = PdfReader(file)
        # extract_text() may return None for image-only pages; coerce to "".
        page_texts = (page.extract_text() or "" for page in document.pages)
        return "\n".join(page_texts)
    except Exception as err:
        return f"PDF read error: {str(err)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_download_file(content: str) -> str:
    """Persist *content* to a temporary UTF-8 .txt file for download.

    Returns:
        Path of the newly created temp file (Gradio's File component
        serves it; the file is intentionally not auto-deleted).
    """
    # delete=False keeps the file on disk after the handle closes so the
    # download component can read it later.
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
    with handle as out:
        out.write(content)
    return handle.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_input(text: str, file, progress=gr.Progress()):
    """Button handler: choose the input source, summarize, build download.

    Args:
        text: Pasted text from the textbox (may be None or empty).
        file: Uploaded PDF file object, or None.
        progress: Gradio progress tracker (injected at call time).

    Returns:
        Tuple of (result markdown, download file path or None).
    """
    progress(0, desc="Reading input...")
    if file is not None:
        # Uploaded PDF wins over pasted text.
        input_text = read_pdf(file)
    elif text and text.strip():
        # Fix: guard against `text is None` (a cleared textbox), which
        # previously crashed on .strip() with AttributeError.
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None

    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)

    return result, download_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI definition. Components created inside the Blocks context are
# laid out in declaration order, so statement order matters here.
# NOTE(review): the "π" and "β’" glyphs in the display strings below look
# like mojibake (probably emoji and "•" bullets in the original source) --
# confirm the file's encoding before editing them.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Header and feature overview shown at the top of the page.
    gr.Markdown("# π Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "β’ Handles very long documents (thousands of words)\n"
        "β’ Supports **PDF** upload or direct paste\n"
        "β’ Runs on CPU β works on free hardware\n"
        "β’ Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "β’ Includes **5 AI-generated study tips** tailored to the content\n"
        "β’ Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )

    # Inputs: paste box and PDF upload, side by side in one row.
    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"]
        )

    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")

    # Outputs: read-only summary text plus a downloadable .txt file.
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False
    )

    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False
    )

    # Wire the button to the processing pipeline; process_input returns
    # (markdown, file path) matching the two output components.
    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output]
    )

# Start the app (blocking call).
demo.launch()