# Document-Summarization / app_version_1.py
# Author: anyonehomep1mane
# Commit 901814c — "Modular Changes, UI changes"
# app.py (Main Gradio Application for HF Spaces)
# This is ready for Hugging Face Spaces deployment.
# Set HUGGINGFACE_HUB_TOKEN as a Space secret.
import os
import gradio as gr
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
import PyPDF2
from docx import Document
from dotenv import load_dotenv
load_dotenv()
# LLM Setup
# Read the Hugging Face API token from the environment (populated by
# load_dotenv() locally, or by HF Spaces secrets in deployment).
token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not token:
    # Fail fast at import time: the app cannot call the Inference API without it.
    raise ValueError("HUGGINGFACEHUB_API_TOKEN not set. Please configure it in HF Spaces secrets.")
# Remote text-generation endpoint; low temperature keeps summaries focused,
# max_new_tokens bounds the summary length (~400 tokens).
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=0.3,
    top_p=0.9,
    max_new_tokens=400,
    huggingfacehub_api_token=token,
)
# Chat wrapper so the endpoint accepts LangChain chat-message prompts.
chat_model = ChatHuggingFace(llm=llm)
# Summarization Prompt
# System message fixes the assistant's role and constraints; the human
# message carries the extracted document text via the {text} placeholder.
SUMMARIZE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are a highly capable document summarization assistant.
Write a clear, concise summary of the provided document.
Focus on the main ideas, key facts, arguments and conclusions.
Use neutral language. Avoid adding information not present in the text.
Aim for 150–350 words depending on document length."""),
    ("human", "{text}\n\nPlease provide a comprehensive yet concise summary."),
])
# LCEL pipeline: render the prompt, then invoke the chat model.
summarize_chain = SUMMARIZE_PROMPT | chat_model
# File Extraction Function
def _read_txt(file_path: str) -> str:
    """Read a UTF-8 text file and return its stripped contents."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read().strip()


def _read_pdf(file_path: str) -> str:
    """Concatenate the text of every PDF page, one page per line."""
    pages = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # extract_text() may return None for pages without a text layer
            # (e.g. scanned images) — substitute an empty string.
            pages.append(page.extract_text() or "")
    return "\n".join(pages).strip()


def _read_docx(file_path: str) -> str:
    """Join all non-empty paragraphs of a .docx document."""
    doc = Document(file_path)
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()


# Dispatch table: lowercase extension -> reader. Simpler to extend than an
# if/elif chain when new formats are added.
_READERS = {".txt": _read_txt, ".pdf": _read_pdf, ".docx": _read_docx}


def extract_text(file_path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Args:
        file_path: Path to the uploaded document on disk.

    Returns:
        The extracted text, or a user-facing error string. Callers detect
        failure via the string prefix: "❌" (unsupported format) or
        "Error" (read/parse failure).
    """
    ext = os.path.splitext(file_path)[1].lower()
    reader = _READERS.get(ext)
    if reader is None:
        # Unsupported extension — report it outside the try: this branch
        # cannot raise and is not a read failure.
        return "❌ Supported formats: .txt, .pdf, .docx"
    try:
        return reader(file_path)
    except Exception as e:
        # Broad catch is deliberate: the Gradio callback surfaces failures
        # as strings instead of crashing the UI.
        return f"Error reading file: {str(e)}"
# Summarization Function
def summarize_document(file, max_chars: int = 18000):
    """Gradio callback: extract text from an uploaded file and summarize it.

    Args:
        file: Value from gr.File. With type="filepath" (as configured
            below) this is a plain str path; older Gradio configs pass a
            tempfile wrapper exposing a ``.name`` attribute. Both are
            accepted. Falsy values mean "nothing uploaded".
        max_chars: Truncation limit applied before calling the model, to
            avoid endpoint timeouts (default matches the previous
            hard-coded 18k characters).

    Returns:
        The summary (possibly prefixed by a truncation warning), or a
        user-facing error/notice string.
    """
    if not file:
        return "Please upload a document."
    # BUG FIX: gr.File(type="filepath") passes a str, which has no .name
    # attribute — the original `file.name` raised AttributeError. Support
    # both the str and the wrapper-object forms.
    path = file if isinstance(file, str) else file.name
    text = extract_text(path)
    # extract_text signals failure via string prefixes (see its docstring).
    if text.startswith("❌") or text.startswith("Error"):
        return text
    if len(text.strip()) < 80:
        return "Not enough meaningful text extracted."
    # Truncate long texts to avoid timeouts
    warning = ""
    if len(text) > max_chars:
        text = text[:max_chars]
        warning = "⚠️ Document truncated to ~18k characters for processing.\n\n"
    try:
        response = summarize_chain.invoke({"text": text})
        summary = response.content.strip()
        return warning + summary if summary else "No summary generated."
    except Exception as e:
        # Map common Inference API failures to friendly messages.
        err = str(e).lower()
        if "token" in err or "authorization" in err:
            return "❌ Hugging Face token invalid or missing."
        if "rate limit" in err:
            return "❌ Rate limit reached. Try later."
        return f"❌ Error: {str(e)}"
# Gradio Interface
# Single-column layout: file upload -> button -> summary textbox.
with gr.Blocks(title="Document Summarizer") as demo:
    gr.Markdown("# πŸ“„ Document Summarizer")
    gr.Markdown("Upload TXT, PDF, or DOCX and get an AI summary using Qwen2.5-7B-Instruct via Hugging Face.")
    # type="filepath" makes Gradio pass the upload to the callback as a
    # plain filesystem path string.
    file_input = gr.File(
        label="Upload Document",
        file_types=[".txt", ".pdf", ".docx"],
        type="filepath"
    )
    btn = gr.Button("Generate Summary", variant="primary")
    output = gr.Textbox(
        label="Summary",
        lines=14,
        placeholder="Summary will appear here..."
    )
    # Wire the button to the summarization callback.
    btn.click(
        fn=summarize_document,
        inputs=file_input,
        outputs=output
    )
    gr.Markdown("""
**Notes**:
- Powered by Hugging Face Inference API.
- Free tier has rate limits.
""")

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind for Hugging Face Spaces containers.
    demo.launch(server_name="0.0.0.0", server_port=7860)