# app.py — Gradio document summarizer, ready for Hugging Face Spaces deployment.
#
# Deployment note: set HUGGINGFACEHUB_API_TOKEN as a Space secret (that is the
# exact variable name read below via os.getenv).

import os

import gradio as gr
import PyPDF2
from docx import Document
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

load_dotenv()

# Documents longer than this are truncated before being sent to the model,
# to keep requests within the free-tier inference time limits.
MAX_INPUT_CHARS = 18000

# --- LLM setup --------------------------------------------------------------
token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN not set. Please configure it in HF Spaces secrets.")

llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=0.3,
    top_p=0.9,
    max_new_tokens=400,
    huggingfacehub_api_token=token,
)
chat_model = ChatHuggingFace(llm=llm)

# --- Summarization prompt ---------------------------------------------------
SUMMARIZE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are a highly capable document summarization assistant. Write a clear, concise summary of the provided document. Focus on the main ideas, key facts, arguments and conclusions. Use neutral language. Avoid adding information not present in the text. 
Aim for 150–350 words depending on document length."""),
    ("human", "{text}\n\nPlease provide a comprehensive yet concise summary."),
])
summarize_chain = SUMMARIZE_PROMPT | chat_model


def extract_text(file_path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Returns the extracted text, or a human-readable error string
    (prefixed with "❌" or "Error") that the caller detects by prefix.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".txt":
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
        elif ext == ".pdf":
            text = ""
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    # extract_text() may return None for image-only pages.
                    page_text = page.extract_text() or ""
                    text += page_text + "\n"
            return text.strip()
        elif ext == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()
        else:
            return "❌ Supported formats: .txt, .pdf, .docx"
    except Exception as e:
        return f"Error reading file: {str(e)}"


def summarize_document(file):
    """Gradio callback: summarize an uploaded document.

    `file` is the value of a gr.File(type="filepath") component, which is a
    plain str path in current Gradio; older versions passed a tempfile-like
    object exposing `.name`. Accept both. Returns the summary text or a
    user-facing error message.
    """
    if not file:
        return "Please upload a document."
    # BUG FIX: with type="filepath" Gradio supplies a str, which has no
    # usable `.name` path attribute — only fall back to `.name` for
    # legacy file-wrapper objects.
    file_path = file if isinstance(file, str) else file.name
    text = extract_text(file_path)
    if text.startswith("❌") or text.startswith("Error"):
        return text
    if len(text.strip()) < 80:
        return "Not enough meaningful text extracted."
    # Truncate long texts to avoid timeouts.
    if len(text) > MAX_INPUT_CHARS:
        text = text[:MAX_INPUT_CHARS]
        warning = "⚠️ Document truncated to ~18k characters for processing.\n\n"
    else:
        warning = ""
    try:
        response = summarize_chain.invoke({"text": text})
        summary = response.content.strip()
        # Explicit branches (the original one-liner relied on conditional-
        # expression precedence: warning is intentionally dropped when the
        # model returns nothing).
        if not summary:
            return "No summary generated."
        return warning + summary
    except Exception as e:
        err = str(e).lower()
        if "token" in err or "authorization" in err:
            return "❌ Hugging Face token invalid or missing."
        if "rate limit" in err:
            return "❌ Rate limit reached. Try later."
        return f"❌ Error: {str(e)}"


# --- Gradio interface -------------------------------------------------------
with gr.Blocks(title="Document Summarizer") as demo:
    gr.Markdown("# 📄 Document Summarizer")
    gr.Markdown("Upload TXT, PDF, or DOCX and get an AI summary using Qwen2.5-7B-Instruct via Hugging Face.")
    file_input = gr.File(
        label="Upload Document",
        file_types=[".txt", ".pdf", ".docx"],
        type="filepath",
    )
    btn = gr.Button("Generate Summary", variant="primary")
    output = gr.Textbox(
        label="Summary",
        lines=14,
        placeholder="Summary will appear here...",
    )
    btn.click(
        fn=summarize_document,
        inputs=file_input,
        outputs=output,
    )
    gr.Markdown("""
**Notes**:
- Powered by Hugging Face Inference API.
- Free tier has rate limits.
""")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)