Spaces:

HF-Pawan
/

Document-Summarization

Running

App Files Files Community

anyonehomep1mane committed on Jan 29

Commit

86ef765

0 Parent(s):

Initial Changes

Browse files

Files changed (4) hide show

.gitignore +3 -0
Dockerfile +25 -0
app.py +136 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.env
+venv
+.vscode

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Dockerfile (for custom HF Spaces deployment)
+# Use: In HF Spaces, select "Docker" runtime and upload this Dockerfile.
+FROM python:3.10-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies (if needed for any libs).
+# --no-install-recommends keeps optional packages out of the image; the apt
+# package lists are deleted in the same layer so they are never stored.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first so the dependency layer is reused when only app.py changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY app.py .
+# Expose Gradio port (app.py launches the server on 7860)
+EXPOSE 7860
+# Run the app
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# app.py (Main Gradio Application for HF Spaces)
+# This is ready for Hugging Face Spaces deployment.
+# Set HUGGINGFACEHUB_API_TOKEN as a Space secret.
+import os
+import gradio as gr
+from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
+from langchain_core.prompts import ChatPromptTemplate
+import PyPDF2
+from docx import Document
+from dotenv import load_dotenv
+# Load environment variables from a local .env file via python-dotenv so the
+# token lookup below can see them.
+load_dotenv()
+# LLM Setup
+# The API token is read from the environment; importing this module aborts
+# with a ValueError when the variable is missing or empty.
+token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+if not token:
+    raise ValueError("HUGGINGFACEHUB_API_TOKEN not set. Please configure it in HF Spaces secrets.")
+# Endpoint configuration for the Qwen2.5-7B-Instruct model. max_new_tokens
+# caps the length of each generated reply.
+llm = HuggingFaceEndpoint(
+    repo_id="Qwen/Qwen2.5-7B-Instruct",
+    task="text-generation",
+    temperature=0.3,
+    top_p=0.9,
+    max_new_tokens=400,
+    huggingfacehub_api_token=token,
+)
+# Wrap the endpoint so it can be driven with chat-style messages.
+chat_model = ChatHuggingFace(llm=llm)
+# Summarization Prompt
+# Two-message template: fixed system instructions plus a human turn whose
+# {text} placeholder receives the document text when the chain is invoked.
+SUMMARIZE_PROMPT = ChatPromptTemplate.from_messages([
+    ("system", """You are a highly capable document summarization assistant.
+Write a clear, concise summary of the provided document.
+Focus on the main ideas, key facts, arguments and conclusions.
+Use neutral language. Avoid adding information not present in the text.
+Aim for 150–350 words depending on document length."""),
+    ("human", "{text}\n\nPlease provide a comprehensive yet concise summary."),
+])
+# Pipe the prompt into the chat model; summarize_document invokes this chain
+# with {"text": ...}.
+summarize_chain = SUMMARIZE_PROMPT | chat_model
+# File Extraction Function
+def extract_text(file_path: str) -> str:
+    """Extract plain text from a .txt, .pdf or .docx file.
+
+    Args:
+        file_path: Path to the document. The format is chosen from the file
+            extension, compared case-insensitively.
+
+    Returns:
+        The extracted text with surrounding whitespace stripped. Failures are
+        reported in-band instead of being raised: an unsupported extension
+        yields a message starting with "❌", and any exception raised while
+        reading yields a message starting with "Error reading file:".
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+    try:
+        if ext == ".txt":
+            # utf-8-sig decodes plain UTF-8 identically but drops a leading
+            # BOM, which str.strip() would otherwise leave in the text.
+            with open(file_path, "r", encoding="utf-8-sig") as f:
+                return f.read().strip()
+        if ext == ".pdf":
+            with open(file_path, "rb") as f:
+                reader = PyPDF2.PdfReader(f)
+                # `or ""` guards against pages whose extract_text() returns
+                # a falsy value; each page is terminated with a newline.
+                pages = [(page.extract_text() or "") + "\n" for page in reader.pages]
+            return "".join(pages).strip()
+        if ext == ".docx":
+            doc = Document(file_path)
+            # Paragraphs that are empty or whitespace-only are skipped.
+            return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()
+        return "❌ Supported formats: .txt, .pdf, .docx"
+    except Exception as e:
+        # Deliberate catch-all: the caller shows this string to the user.
+        return f"Error reading file: {e}"
+# Summarization Function
+def summarize_document(file):
+    """Summarize an uploaded document and return text for the output box.
+
+    Args:
+        file: The value delivered by the upload component. Either a path
+            string or an object exposing the path as ``.name``; a falsy
+            value means nothing was uploaded.
+
+    Returns:
+        The model's summary (prefixed with a truncation warning when the
+        document exceeded 18000 characters), or a human-readable message
+        describing why no summary could be produced. Never raises for
+        extraction or model errors; those are converted to messages.
+    """
+    import re  # local import keeps this block self-contained
+
+    if not file:
+        return "Please upload a document."
+    # The File component is declared with type="filepath", so a plain path
+    # string must be accepted as well as objects carrying a .name attribute.
+    path = file if isinstance(file, str) else file.name
+    text = extract_text(path)
+    # Match the exact failure prefixes extract_text emits, so a document
+    # whose own text merely begins with "Error" is still summarized.
+    if text.startswith(("❌", "Error reading file:")):
+        return text
+    if len(text.strip()) < 80:
+        return "Not enough meaningful text extracted."
+    # Truncate long texts to avoid timeouts
+    warning = ""
+    if len(text) > 18000:
+        text = text[:18000]
+        warning = "⚠️ Document truncated to ~18k characters for processing.\n\n"
+    try:
+        response = summarize_chain.invoke({"text": text})
+        summary = response.content.strip()
+        return warning + summary if summary else "No summary generated."
+    except Exception as e:
+        err = str(e).lower()
+        # Whole-word match: messages about "tokens" or "max_new_tokens"
+        # (length limits) must not be reported as a credential problem.
+        if re.search(r"\btoken\b", err) or "authorization" in err or "unauthorized" in err:
+            return "❌ Hugging Face token invalid or missing."
+        if "rate limit" in err:
+            return "❌ Rate limit reached. Try later."
+        return f"❌ Error: {e}"
+# Gradio Interface
+# Builds the single-page UI: heading, description, upload widget, button,
+# summary box and a short notes section, all created inside one Blocks context.
+with gr.Blocks(title="Document Summarizer") as demo:
+    gr.Markdown("# 📄 Document Summarizer")
+    gr.Markdown("Upload TXT, PDF, or DOCX and get an AI summary using Qwen2.5-7B-Instruct via Hugging Face.")
+    # The picker is limited to the three extensions extract_text can parse.
+    file_input = gr.File(
+        label="Upload Document",
+        file_types=[".txt", ".pdf", ".docx"],
+        type="filepath"
+    )
+    btn = gr.Button("Generate Summary", variant="primary")
+    output = gr.Textbox(
+        label="Summary",
+        lines=14,
+        placeholder="Summary will appear here..."
+    )
+    # Clicking the button passes the uploaded file to summarize_document and
+    # writes its returned string into the summary box.
+    btn.click(
+        fn=summarize_document,
+        inputs=file_input,
+        outputs=output
+    )
+    gr.Markdown("""
+    **Notes**:
+    - Powered by Hugging Face Inference API.
+    - Free tier has rate limits.
+    """)
+# Start the server only when run as a script; it binds to all interfaces on
+# port 7860, the port the Dockerfile exposes.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+langchain
+langchain_core
+langchain-huggingface
+PyPDF2
+python-docx
+# python-dotenv is the distribution that provides the `dotenv` module imported by app.py
+python-dotenv