# app.py (Main Gradio Application for HF Spaces)
# This is ready for Hugging Face Spaces deployment.
# Set HUGGINGFACEHUB_API_TOKEN as a Space secret.
import os
import gradio as gr
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
import PyPDF2
from docx import Document
from dotenv import load_dotenv

# Load variables from a local .env file for local development.
# On HF Spaces this is a no-op: secrets arrive via the process environment.
load_dotenv()
# ---- LLM configuration ----
# The token must be provided via the environment (HF Spaces secret);
# fail fast at import time rather than on the first request.
token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if token is None or token == "":
    raise ValueError("HUGGINGFACEHUB_API_TOKEN not set. Please configure it in HF Spaces secrets.")

# Remote text-generation endpoint with conservative sampling settings.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=0.3,
    top_p=0.9,
    max_new_tokens=400,
    huggingfacehub_api_token=token,
)

# Chat wrapper so the endpoint can be driven by ChatPromptTemplate messages.
chat_model = ChatHuggingFace(llm=llm)
# ---- Summarization prompt ----
# Fixed mojibake in the word-count range ("150β350" -> "150-350") so the
# instruction the model receives is readable.
SUMMARIZE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are a highly capable document summarization assistant.
Write a clear, concise summary of the provided document.
Focus on the main ideas, key facts, arguments and conclusions.
Use neutral language. Avoid adding information not present in the text.
Aim for 150-350 words depending on document length."""),
    ("human", "{text}\n\nPlease provide a comprehensive yet concise summary."),
])

# Runnable pipeline: prompt formatting -> chat model call.
summarize_chain = SUMMARIZE_PROMPT | chat_model
# File Extraction Function
def extract_text(file_path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Returns the stripped text on success. On failure (unsupported
    extension or a read/parse error) returns a message starting with
    "Error" — callers detect failure by that prefix rather than by
    catching exceptions.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".txt":
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
        if ext == ".pdf":
            parts = []
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    # extract_text() can return None (e.g. image-only pages).
                    parts.append(page.extract_text() or "")
            return "\n".join(parts).strip()
        if ext == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()
        # Was a mojibake-prefixed message ("β ..."); use the "Error" prefix
        # that the caller's existing startswith("Error") check already catches.
        return "Error: unsupported file type. Supported formats: .txt, .pdf, .docx"
    except Exception as e:
        return f"Error reading file: {e}"
# Summarization Function
def summarize_document(file):
    """Gradio handler: extract text from the uploaded file and summarize it.

    Accepts either a filepath string (gr.File with type="filepath" passes a
    plain str) or a file-like object exposing ``.name``. Returns the summary
    text, possibly prefixed with a truncation warning, or a human-readable
    error message.
    """
    if not file:
        return "Please upload a document."
    # BUG FIX: with gr.File(type="filepath") Gradio passes a str, and
    # str has no .name attribute — the original `file.name` raised
    # AttributeError. Support both str paths and file-like objects.
    path = file if isinstance(file, str) else file.name
    text = extract_text(path)
    # extract_text signals failure via a message prefix; accept both the
    # current "Error" prefix and the legacy mojibake marker.
    if text.startswith(("Error", "β", "❌")):
        return text
    if len(text.strip()) < 80:
        return "Not enough meaningful text extracted."
    # Truncate very long documents to keep inference time bounded.
    warning = ""
    if len(text) > 18000:
        text = text[:18000]
        warning = "⚠️ Document truncated to ~18k characters for processing.\n\n"
    try:
        response = summarize_chain.invoke({"text": text})
        summary = response.content.strip()
        return warning + summary if summary else "No summary generated."
    except Exception as e:
        # Map the most common Inference API failures to friendly messages
        # (mojibake "β" markers fixed to real emoji).
        err = str(e).lower()
        if "token" in err or "authorization" in err:
            return "❌ Hugging Face token invalid or missing."
        if "rate limit" in err:
            return "❌ Rate limit reached. Try later."
        return f"❌ Error: {e}"
# Gradio Interface
# ---- Gradio interface ----
# Heading emoji was mojibake ("π"); restored to the document emoji.
with gr.Blocks(title="Document Summarizer") as demo:
    gr.Markdown("# 📄 Document Summarizer")
    gr.Markdown("Upload TXT, PDF, or DOCX and get an AI summary using Qwen2.5-7B-Instruct via Hugging Face.")

    # type="filepath" makes Gradio pass the upload to the handler as a str path.
    file_input = gr.File(
        label="Upload Document",
        file_types=[".txt", ".pdf", ".docx"],
        type="filepath"
    )
    btn = gr.Button("Generate Summary", variant="primary")
    output = gr.Textbox(
        label="Summary",
        lines=14,
        placeholder="Summary will appear here..."
    )
    btn.click(
        fn=summarize_document,
        inputs=file_input,
        outputs=output
    )
    gr.Markdown("""
**Notes**:
- Powered by Hugging Face Inference API.
- Free tier has rate limits.
""")
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the standard HF Spaces port).
    # Removed trailing scrape artifact ("|") from the launch line.
    demo.launch(server_name="0.0.0.0", server_port=7860)