omgy committed on
Commit
9b58e0e
·
verified ·
1 Parent(s): 0430b7d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -71
app.py CHANGED
@@ -4,66 +4,57 @@ import fitz # PyMuPDF
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
 
7
 
8
  # --- CONFIGURATION ---
9
- # The model ID for the summarization task.
10
  MODEL_ID = "sshleifer/distilbart-cnn-12-6"
11
  API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
12
-
13
- # IMPORTANT: Load the API token from Hugging Face Space's secrets
14
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
15
 
16
- # --- 1. PDF TEXT EXTRACTION ---
 
17
  def extract_text_from_pdf(pdf_file):
18
- """
19
- Extracts text from an uploaded PDF file object.
20
- Gradio passes a temporary file object, not a path.
21
- """
22
  try:
23
- # Open the PDF from the file-like object's raw bytes
24
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
25
  full_text = ""
26
  for page in doc:
27
  full_text += page.get_text()
28
- return full_text
 
29
  except Exception as e:
30
- raise gr.Error(f"Failed to read PDF. Is it a valid PDF file? Error: {e}")
 
 
 
 
 
 
31
 
32
- # --- 2. TEXT SUMMARIZATION (THE "TWEAK") ---
33
  def summarize_text(text_to_summarize):
34
- """
35
- Sends text to the Hugging Face API for summarization.
36
- Includes error handling for API calls.
37
- """
38
  if not HF_API_TOKEN:
39
- raise gr.Error("Hugging Face API token is not set. Please add it to the Space's secrets.")
40
 
41
  headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
42
-
43
- payload = {
44
- "inputs": text_to_summarize,
45
- "parameters": {
46
- "min_length": 50,
47
- "max_length": 250,
48
- "do_sample": False
49
  }
50
- }
 
 
 
 
 
51
 
52
- response = requests.post(API_URL, headers=headers, json=payload)
53
-
54
- if response.status_code == 200:
55
- summary = response.json()[0]['summary_text']
56
- return summary
57
- else:
58
- # Provide a more user-friendly error message
59
- error_details = response.json().get('error', response.text)
60
- raise gr.Error(f"Model API Error: {error_details}")
61
-
62
- # --- 3. SAVE THE RESULT TO A NEW PDF ---
63
  def save_text_to_pdf(text):
64
- """
65
- Saves the summary text to a new PDF file and returns its path.
66
- """
67
  pdf = FPDF()
68
  pdf.add_page()
69
  pdf.set_font("Arial", "B", 16)
@@ -71,56 +62,46 @@ def save_text_to_pdf(text):
71
  pdf.ln(10)
72
 
73
  pdf.set_font("Arial", size=12)
74
- # Encode text properly to avoid FPDF errors with special characters
75
- cleaned_text = text.encode('latin-1', 'replace').decode('latin-1')
76
  pdf.multi_cell(0, 10, cleaned_text)
77
-
78
- # Create a temporary file to save the PDF
79
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
80
- pdf.output(tmp_file.name)
81
- return tmp_file.name # Return the path to the temporary file
82
 
83
- # --- MAIN WORKFLOW FUNCTION FOR GRADIO ---
 
 
 
 
 
 
84
  def tweak_pdf_workflow(uploaded_pdf):
85
- """
86
- The main function that orchestrates the entire process for the Gradio interface.
87
- """
88
  if uploaded_pdf is None:
89
- raise gr.Error("Please upload a PDF file first.")
90
-
91
- gr.Info("Step 1: Extracting text from your PDF...")
92
- original_text = extract_text_from_pdf(uploaded_pdf)
93
 
94
- gr.Info("Step 2: Sending text to the AI model for tweaking...")
 
 
 
 
 
95
  tweaked_text = summarize_text(original_text)
96
-
97
- gr.Info("Step 3: Creating your new PDF for download...")
98
  output_pdf_path = save_text_to_pdf(tweaked_text)
99
-
100
  return output_pdf_path
101
 
102
  # --- GRADIO INTERFACE ---
 
103
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
104
  gr.Markdown(
105
  """
106
- # 📄 PDF Document Tweaker (TLDR)
107
- Upload a PDF, and this app will use the `sshleifer/distilbart-cnn-12-6` model
108
- to summarize its content and provide a new, tweaked PDF for download.
109
  """
110
  )
111
  with gr.Row():
112
  pdf_input = gr.File(label="Upload Your PDF", file_types=[".pdf"])
113
  pdf_output = gr.File(label="Download Tweaked PDF")
114
-
115
  submit_button = gr.Button("Tweak My Document!", variant="primary")
116
-
117
- submit_button.click(
118
- fn=tweak_pdf_workflow,
119
- inputs=pdf_input,
120
- outputs=pdf_output
121
- )
122
-
123
- gr.Markdown("Created with Gradio and Hugging Face Spaces.")
124
 
125
  if __name__ == "__main__":
126
- iface.launch()
 
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
7
+ import math
8
 
9
  # --- CONFIGURATION ---
 
10
  MODEL_ID = "sshleifer/distilbart-cnn-12-6"
11
  API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
 
 
12
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
13
 
14
+ # --- HELPER FUNCTIONS ---
15
+
16
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: Either a binary file-like object exposing ``read()`` or a
            filesystem path to the PDF. Which of the two the Gradio file
            component hands over depends on the installed Gradio version and
            component configuration -- TODO confirm against the deployment.

    Returns:
        The concatenated text of all pages, with leading and trailing
        whitespace stripped. May be an empty string.

    Raises:
        gr.Error: If the document cannot be opened or its text cannot be read.
    """
    try:
        if hasattr(pdf_file, "read"):
            # File-like upload: parse straight from the in-memory bytes.
            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        else:
            # Path-like upload: let PyMuPDF open the file itself.
            doc = fitz.open(pdf_file)
        try:
            return "".join(page.get_text() for page in doc).strip()
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
    except Exception as e:
        # UI boundary: surface any failure as a user-visible Gradio error,
        # keeping the original exception chained for the server log.
        raise gr.Error(f"Failed to read PDF. Is it valid? Error: {e}") from e
27
+
28
def chunk_text(text, max_tokens=1000):
    """Yield consecutive chunks of *text*, each at most *max_tokens* words long.

    Despite the parameter name, the unit is whitespace-separated words, not
    model tokens. Runs of whitespace (including newlines) are collapsed to a
    single space inside each chunk.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Yields:
        Each chunk as a single space-joined string. Nothing is yielded for
        empty or whitespace-only input.

    Raises:
        ValueError: If ``max_tokens`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_tokens < 1:
        # A negative step would silently produce no chunks at all.
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    words = text.split()
    for start in range(0, len(words), max_tokens):
        yield " ".join(words[start:start + max_tokens])
33
 
 
34
def summarize_text(text_to_summarize):
    """Summarize text via the Hugging Face Inference API, chunk by chunk.

    The text is split into chunks of at most 500 words with ``chunk_text``;
    each chunk is posted to ``API_URL`` and the per-chunk summaries are joined
    with single spaces.

    Args:
        text_to_summarize: The full text to summarize.

    Returns:
        The space-joined summaries of all chunks (an empty string when the
        input contains no words).

    Raises:
        gr.Error: If the API token is missing, the request fails or times
            out, the API answers with a non-200 status, or the response body
            does not have the expected ``[{"summary_text": ...}]`` shape.
    """
    if not HF_API_TOKEN:
        raise gr.Error("Hugging Face API token is not set. Add it as an environment variable.")

    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    parameters = {"min_length": 50, "max_length": 250, "do_sample": False}
    summaries = []

    for chunk in chunk_text(text_to_summarize, max_tokens=500):
        payload = {"inputs": chunk, "parameters": parameters}
        try:
            # Without a timeout a stalled connection would block the UI
            # callback forever. NOTE(review): 120 s is a guess -- tune it.
            response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        except requests.RequestException as e:
            raise gr.Error(f"Model API Error: {e}") from e

        # Error pages are not guaranteed to be JSON, so decode defensively.
        try:
            body = response.json()
        except ValueError:
            body = None

        if response.status_code != 200:
            if isinstance(body, dict):
                error_details = body.get('error', response.text)
            else:
                error_details = response.text
            raise gr.Error(f"Model API Error: {error_details}")

        try:
            summaries.append(body[0]["summary_text"])
        except (TypeError, IndexError, KeyError) as e:
            raise gr.Error(
                f"Model API Error: unexpected response format: {response.text}"
            ) from e

    return " ".join(summaries)
55
+
 
 
 
 
 
 
 
 
 
56
  def save_text_to_pdf(text):
57
+ """Save summarized text to a new PDF and return its path."""
 
 
58
  pdf = FPDF()
59
  pdf.add_page()
60
  pdf.set_font("Arial", "B", 16)
 
62
  pdf.ln(10)
63
 
64
  pdf.set_font("Arial", size=12)
65
+ cleaned_text = text.encode('latin-1', 'replace').decode('latin-1') # FPDF limitation
 
66
  pdf.multi_cell(0, 10, cleaned_text)
 
 
 
 
 
67
 
68
+ tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
69
+ pdf.output(tmp_file.name)
70
+ tmp_file.close()
71
+ return tmp_file.name
72
+
73
+ # --- MAIN WORKFLOW ---
74
+
75
def tweak_pdf_workflow(uploaded_pdf):
    """Run the full upload -> extract -> summarize -> save pipeline.

    Args:
        uploaded_pdf: The value delivered by the PDF upload component, or
            ``None`` when nothing has been uploaded.

    Returns:
        Whatever ``save_text_to_pdf`` returns for the summarized text.

    Raises:
        gr.Error: If no file was uploaded or the PDF yields no text. Errors
            raised by the extraction, summarization and saving helpers
            propagate unchanged.
    """
    if uploaded_pdf is None:
        raise gr.Error("Please upload a PDF first.")

    # Extract, and refuse to continue with nothing to summarize.
    extracted = extract_text_from_pdf(uploaded_pdf)
    has_text = bool(extracted.strip())
    if not has_text:
        raise gr.Error("PDF contains no extractable text.")

    # Summarize, then write the summary out as a new PDF.
    summary = summarize_text(extracted)
    return save_text_to_pdf(summary)
90
 
91
  # --- GRADIO INTERFACE ---
92
+
93
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
94
  gr.Markdown(
95
  """
96
+ # 📄 PDF Document Tweaker (TL;DR)
97
+ Upload a PDF and get a summarized, tweaked PDF using Hugging Face's `distilbart-cnn-12-6`.
 
98
  """
99
  )
100
  with gr.Row():
101
  pdf_input = gr.File(label="Upload Your PDF", file_types=[".pdf"])
102
  pdf_output = gr.File(label="Download Tweaked PDF")
 
103
  submit_button = gr.Button("Tweak My Document!", variant="primary")
104
+ submit_button.click(fn=tweak_pdf_workflow, inputs=pdf_input, outputs=pdf_output)
 
 
 
 
 
 
 
105
 
106
  if __name__ == "__main__":
107
+ iface.launch(server_name="0.0.0.0", server_port=7860)