# app.py (Main Gradio Application for HF Spaces)
# This is ready for Hugging Face Spaces deployment.
# Set HUGGINGFACEHUB_API_TOKEN as a Space secret.
import os
import gradio as gr
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
import PyPDF2
from docx import Document
from dotenv import load_dotenv

# Load variables from a local .env file for local development.
# On HF Spaces this is a no-op: secrets arrive via the process environment.
load_dotenv()
# ---- LLM configuration ----
# The token must be provided via the environment (HF Spaces secret);
# fail fast at import time rather than on the first request.
token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if token is None or token == "":
    raise ValueError("HUGGINGFACEHUB_API_TOKEN not set. Please configure it in HF Spaces secrets.")

# Remote text-generation endpoint with conservative sampling settings.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=0.3,
    top_p=0.9,
    max_new_tokens=400,
    huggingfacehub_api_token=token,
)

# Chat wrapper so the endpoint can be driven by ChatPromptTemplate messages.
chat_model = ChatHuggingFace(llm=llm)
# ---- Summarization prompt ----
# Fixed mojibake in the word-count range ("150β350" -> "150-350") so the
# instruction the model receives is readable.
SUMMARIZE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are a highly capable document summarization assistant.
Write a clear, concise summary of the provided document.
Focus on the main ideas, key facts, arguments and conclusions.
Use neutral language. Avoid adding information not present in the text.
Aim for 150-350 words depending on document length."""),
    ("human", "{text}\n\nPlease provide a comprehensive yet concise summary."),
])

# Runnable pipeline: prompt formatting -> chat model call.
summarize_chain = SUMMARIZE_PROMPT | chat_model
# File Extraction Function
def extract_text(file_path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Returns the stripped text on success. On failure (unsupported
    extension or a read/parse error) returns a message starting with
    "Error" — callers detect failure by that prefix rather than by
    catching exceptions.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".txt":
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
        if ext == ".pdf":
            parts = []
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    # extract_text() can return None (e.g. image-only pages).
                    parts.append(page.extract_text() or "")
            return "\n".join(parts).strip()
        if ext == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()
        # Was a mojibake-prefixed message ("β ..."); use the "Error" prefix
        # that the caller's existing startswith("Error") check already catches.
        return "Error: unsupported file type. Supported formats: .txt, .pdf, .docx"
    except Exception as e:
        return f"Error reading file: {e}"
# Summarization Function
def summarize_document(file):
    """Gradio handler: extract text from the uploaded file and summarize it.

    Accepts either a filepath string (gr.File with type="filepath" passes a
    plain str) or a file-like object exposing ``.name``. Returns the summary
    text, possibly prefixed with a truncation warning, or a human-readable
    error message.
    """
    if not file:
        return "Please upload a document."
    # BUG FIX: with gr.File(type="filepath") Gradio passes a str, and
    # str has no .name attribute — the original `file.name` raised
    # AttributeError. Support both str paths and file-like objects.
    path = file if isinstance(file, str) else file.name
    text = extract_text(path)
    # extract_text signals failure via a message prefix; accept both the
    # current "Error" prefix and the legacy mojibake marker.
    if text.startswith(("Error", "β", "❌")):
        return text
    if len(text.strip()) < 80:
        return "Not enough meaningful text extracted."
    # Truncate very long documents to keep inference time bounded.
    warning = ""
    if len(text) > 18000:
        text = text[:18000]
        warning = "⚠️ Document truncated to ~18k characters for processing.\n\n"
    try:
        response = summarize_chain.invoke({"text": text})
        summary = response.content.strip()
        return warning + summary if summary else "No summary generated."
    except Exception as e:
        # Map the most common Inference API failures to friendly messages
        # (mojibake "β" markers fixed to real emoji).
        err = str(e).lower()
        if "token" in err or "authorization" in err:
            return "❌ Hugging Face token invalid or missing."
        if "rate limit" in err:
            return "❌ Rate limit reached. Try later."
        return f"❌ Error: {e}"
# Gradio Interface
# ---- Gradio interface ----
# Heading emoji was mojibake ("π"); restored to the document emoji.
with gr.Blocks(title="Document Summarizer") as demo:
    gr.Markdown("# 📄 Document Summarizer")
    gr.Markdown("Upload TXT, PDF, or DOCX and get an AI summary using Qwen2.5-7B-Instruct via Hugging Face.")

    # type="filepath" makes Gradio pass the upload to the handler as a str path.
    file_input = gr.File(
        label="Upload Document",
        file_types=[".txt", ".pdf", ".docx"],
        type="filepath"
    )
    btn = gr.Button("Generate Summary", variant="primary")
    output = gr.Textbox(
        label="Summary",
        lines=14,
        placeholder="Summary will appear here..."
    )
    btn.click(
        fn=summarize_document,
        inputs=file_input,
        outputs=output
    )
    gr.Markdown("""
**Notes**:
- Powered by Hugging Face Inference API.
- Free tier has rate limits.
""")
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the standard HF Spaces port).
    # Removed trailing scrape artifact ("|") from the launch line.
    demo.launch(server_name="0.0.0.0", server_port=7860)