Spaces:

omgy
/

mistral

Sleeping

App Files Community

omgy committed on Sep 5, 2025

Commit

9b58e0e

verified ·

1 Parent(s): 0430b7d

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -71

app.py CHANGED Viewed

@@ -4,66 +4,57 @@ import fitz  # PyMuPDF
 from fpdf import FPDF
 import os
 import tempfile
 # --- CONFIGURATION ---
-# The model ID for the summarization task.
 MODEL_ID = "sshleifer/distilbart-cnn-12-6"
 API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
-# IMPORTANT: Load the API token from Hugging Face Space's secrets
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
-# --- 1. PDF TEXT EXTRACTION ---
 def extract_text_from_pdf(pdf_file):
-    """
-    Extracts text from an uploaded PDF file object.
-    Gradio passes a temporary file object, not a path.
-    """
     try:
-        # Open the PDF from the file-like object's raw bytes
         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
         full_text = ""
         for page in doc:
             full_text += page.get_text()
-        return full_text
     except Exception as e:
-        raise gr.Error(f"Failed to read PDF. Is it a valid PDF file? Error: {e}")
-# --- 2. TEXT SUMMARIZATION (THE "TWEAK") ---
 def summarize_text(text_to_summarize):
-    """
-    Sends text to the Hugging Face API for summarization.
-    Includes error handling for API calls.
-    """
     if not HF_API_TOKEN:
-        raise gr.Error("Hugging Face API token is not set. Please add it to the Space's secrets.")
     headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
-    payload = {
-        "inputs": text_to_summarize,
-        "parameters": {
-            "min_length": 50,
-            "max_length": 250,
-            "do_sample": False
         }
-    }
-    response = requests.post(API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        summary = response.json()[0]['summary_text']
-        return summary
-    else:
-        # Provide a more user-friendly error message
-        error_details = response.json().get('error', response.text)
-        raise gr.Error(f"Model API Error: {error_details}")
-# --- 3. SAVE THE RESULT TO A NEW PDF ---
 def save_text_to_pdf(text):
-    """
-    Saves the summary text to a new PDF file and returns its path.
-    """
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font("Arial", "B", 16)
@@ -71,56 +62,46 @@ def save_text_to_pdf(text):
     pdf.ln(10)
     pdf.set_font("Arial", size=12)
-    # Encode text properly to avoid FPDF errors with special characters
-    cleaned_text = text.encode('latin-1', 'replace').decode('latin-1')
     pdf.multi_cell(0, 10, cleaned_text)
-    # Create a temporary file to save the PDF
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
-        pdf.output(tmp_file.name)
-        return tmp_file.name # Return the path to the temporary file
-# --- MAIN WORKFLOW FUNCTION FOR GRADIO ---
 def tweak_pdf_workflow(uploaded_pdf):
-    """
-    The main function that orchestrates the entire process for the Gradio interface.
-    """
     if uploaded_pdf is None:
-        raise gr.Error("Please upload a PDF file first.")
-    gr.Info("Step 1: Extracting text from your PDF...")
-    original_text = extract_text_from_pdf(uploaded_pdf)
-    gr.Info("Step 2: Sending text to the AI model for tweaking...")
     tweaked_text = summarize_text(original_text)
-    gr.Info("Step 3: Creating your new PDF for download...")
     output_pdf_path = save_text_to_pdf(tweaked_text)
     return output_pdf_path
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
-        # 📄 PDF Document Tweaker (TLDR)
-        Upload a PDF, and this app will use the `sshleifer/distilbart-cnn-12-6` model
-        to summarize its content and provide a new, tweaked PDF for download.
         """
     )
     with gr.Row():
         pdf_input = gr.File(label="Upload Your PDF", file_types=[".pdf"])
         pdf_output = gr.File(label="Download Tweaked PDF")
     submit_button = gr.Button("Tweak My Document!", variant="primary")
-    submit_button.click(
-        fn=tweak_pdf_workflow,
-        inputs=pdf_input,
-        outputs=pdf_output
-    )
-    gr.Markdown("Created with Gradio and Hugging Face Spaces.")
 if __name__ == "__main__":
-    iface.launch()

 from fpdf import FPDF
 import os
 import tempfile
+import math
 # --- CONFIGURATION ---
 MODEL_ID = "sshleifer/distilbart-cnn-12-6"
 API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
+# --- HELPER FUNCTIONS ---
 def extract_text_from_pdf(pdf_file):
+    """Extract text from uploaded PDF file."""
     try:
         doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
         full_text = ""
         for page in doc:
             full_text += page.get_text()
+        doc.close()
+        return full_text.strip()
     except Exception as e:
+        raise gr.Error(f"Failed to read PDF. Is it valid? Error: {e}")
+def chunk_text(text, max_tokens=1000):
+    """Split text into chunks of approximately max_tokens words."""
+    words = text.split()
+    for i in range(0, len(words), max_tokens):
+        yield " ".join(words[i:i+max_tokens])
 def summarize_text(text_to_summarize):
+    """Send text to Hugging Face API for summarization, chunking if too long."""
     if not HF_API_TOKEN:
+        raise gr.Error("Hugging Face API token is not set. Add it as an environment variable.")
     headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
+    final_summary = []
+    for chunk in chunk_text(text_to_summarize, max_tokens=500):
+        payload = {
+            "inputs": chunk,
+            "parameters": {"min_length": 50, "max_length": 250, "do_sample": False}
         }
+        response = requests.post(API_URL, headers=headers, json=payload)
+        if response.status_code == 200:
+            final_summary.append(response.json()[0]["summary_text"])
+        else:
+            error_details = response.json().get('error', response.text)
+            raise gr.Error(f"Model API Error: {error_details}")
+    return " ".join(final_summary)
 def save_text_to_pdf(text):
+    """Save summarized text to a new PDF and return its path."""
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font("Arial", "B", 16)
     pdf.ln(10)
     pdf.set_font("Arial", size=12)
+    cleaned_text = text.encode('latin-1', 'replace').decode('latin-1')  # FPDF limitation
     pdf.multi_cell(0, 10, cleaned_text)
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+    pdf.output(tmp_file.name)
+    tmp_file.close()
+    return tmp_file.name
+# --- MAIN WORKFLOW ---
 def tweak_pdf_workflow(uploaded_pdf):
     if uploaded_pdf is None:
+        raise gr.Error("Please upload a PDF first.")
+    # Step 1: Extract
+    original_text = extract_text_from_pdf(uploaded_pdf)
+    if not original_text.strip():
+        raise gr.Error("PDF contains no extractable text.")
+    # Step 2: Summarize
     tweaked_text = summarize_text(original_text)
+    # Step 3: Save
     output_pdf_path = save_text_to_pdf(tweaked_text)
     return output_pdf_path
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
+        # 📄 PDF Document Tweaker (TL;DR)
+        Upload a PDF and get a summarized, tweaked PDF using Hugging Face's `distilbart-cnn-12-6`.
         """
     )
     with gr.Row():
         pdf_input = gr.File(label="Upload Your PDF", file_types=[".pdf"])
         pdf_output = gr.File(label="Download Tweaked PDF")
     submit_button = gr.Button("Tweak My Document!", variant="primary")
+    submit_button.click(fn=tweak_pdf_workflow, inputs=pdf_input, outputs=pdf_output)
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)