tejovanth committed on
Commit
d65c22a
·
verified ·
1 Parent(s): bbd6510

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -5,21 +5,20 @@ from transformers import pipeline
5
  import time, logging
6
  import re
7
  import tempfile
8
- import os
9
 
10
- # === Setup ===
11
  logging.basicConfig(level=logging.ERROR)
12
  device = -1 # CPU
13
  print("⚠️ CPU-only mode. Expect ~20–30s for large documents.")
14
 
15
- # === Load summarization model ===
16
  try:
17
  summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
18
  except Exception as e:
19
  print(f"❌ Model loading failed: {e}")
20
  exit(1)
21
 
22
- # === Text Preprocessing ===
23
  def smart_chunk(text, max_chunk_len=2000):
24
  sentences = re.split(r'(?<=[.!?]) +', text)
25
  chunks, current_chunk = [], ""
@@ -33,7 +32,7 @@ def smart_chunk(text, max_chunk_len=2000):
33
  chunks.append(current_chunk.strip())
34
  return chunks
35
 
36
- # === Summarization per file ===
37
  def summarize_file_bytes(file_bytes, filename):
38
  start_time = time.time()
39
  try:
@@ -48,7 +47,7 @@ def summarize_file_bytes(file_bytes, filename):
48
  if not text:
49
  return f"{filename}: ❌ No text found.", ""
50
 
51
- text = text[:300000]
52
  chunks = smart_chunk(text)
53
  summaries, line_count = [], 0
54
 
@@ -69,19 +68,19 @@ def summarize_file_bytes(file_bytes, filename):
69
  summary_text = f"📄 **{filename}**\n**Characters**: {len(text)} | **Time**: {total_time:.2f}s\n\n" + "\n\n".join(summaries)
70
  return summary_text, summary_text
71
 
72
- # === Gradio Wrapper ===
73
- def summarize_multiple_files(files):
74
  all_summaries = []
75
  combined_text = ""
76
 
77
- for file_obj in files:
78
  file_bytes = file_obj.read()
79
  filename = file_obj.name.split("/")[-1]
80
- summary, raw = summarize_file_bytes(file_bytes, filename)
81
  all_summaries.append(summary)
82
- combined_text += f"\n\n{raw}\n" + "="*60 + "\n"
83
 
84
- # Write summary to temp .txt file
85
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
86
  f.write(combined_text)
87
  summary_file_path = f.name
@@ -91,7 +90,7 @@ def summarize_multiple_files(files):
91
  # === Gradio Interface ===
92
  demo = gr.Interface(
93
  fn=summarize_multiple_files,
94
- inputs=gr.File(label="📄 Upload PDF or TXT files", file_types=[".pdf", ".txt"], type="file", file_count="multiple"),
95
  outputs=[
96
  gr.Textbox(label="📝 Summary", lines=30, max_lines=100),
97
  gr.File(label="📥 Download Summary as .txt")
@@ -100,6 +99,7 @@ demo = gr.Interface(
100
  description="Summarizes multiple PDFs or TXTs into at least 15 lines each. Download final output as .txt. CPU-optimized."
101
  )
102
 
 
103
  if __name__ == "__main__":
104
  try:
105
  demo.launch(share=False, server_port=7860)
@@ -110,3 +110,4 @@ if __name__ == "__main__":
110
 
111
 
112
 
 
 
5
  import time, logging
6
  import re
7
  import tempfile
 
8
 
9
+ # === Setup Logging and Device ===
10
  logging.basicConfig(level=logging.ERROR)
11
  device = -1 # CPU
12
  print("⚠️ CPU-only mode. Expect ~20–30s for large documents.")
13
 
14
+ # === Load the Summarization Model ===
15
  try:
16
  summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
17
  except Exception as e:
18
  print(f"❌ Model loading failed: {e}")
19
  exit(1)
20
 
21
+ # === Sentence-Smart Chunking ===
22
  def smart_chunk(text, max_chunk_len=2000):
23
  sentences = re.split(r'(?<=[.!?]) +', text)
24
  chunks, current_chunk = [], ""
 
32
  chunks.append(current_chunk.strip())
33
  return chunks
34
 
35
+ # === Summarization for a Single File ===
36
  def summarize_file_bytes(file_bytes, filename):
37
  start_time = time.time()
38
  try:
 
47
  if not text:
48
  return f"{filename}: ❌ No text found.", ""
49
 
50
+ text = text[:300000] # Trim to model-safe size
51
  chunks = smart_chunk(text)
52
  summaries, line_count = [], 0
53
 
 
68
  summary_text = f"📄 **{filename}**\n**Characters**: {len(text)} | **Time**: {total_time:.2f}s\n\n" + "\n\n".join(summaries)
69
  return summary_text, summary_text
70
 
71
+ # === Function for Multiple Files ===
72
+ def summarize_multiple_files(file_objs):
73
  all_summaries = []
74
  combined_text = ""
75
 
76
+ for file_obj in file_objs:
77
  file_bytes = file_obj.read()
78
  filename = file_obj.name.split("/")[-1]
79
+ summary, raw_text = summarize_file_bytes(file_bytes, filename)
80
  all_summaries.append(summary)
81
+ combined_text += f"\n\n{raw_text}\n" + "="*60 + "\n"
82
 
83
+ # Save combined summary to a temp .txt file
84
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
85
  f.write(combined_text)
86
  summary_file_path = f.name
 
90
  # === Gradio Interface ===
91
  demo = gr.Interface(
92
  fn=summarize_multiple_files,
93
+ inputs=gr.File(label="📄 Upload PDF or TXT files", file_types=[".pdf", ".txt"], type="binary", file_count="multiple"),
94
  outputs=[
95
  gr.Textbox(label="📝 Summary", lines=30, max_lines=100),
96
  gr.File(label="📥 Download Summary as .txt")
 
99
  description="Summarizes multiple PDFs or TXTs into at least 15 lines each. Download final output as .txt. CPU-optimized."
100
  )
101
 
102
+ # === Run the App ===
103
  if __name__ == "__main__":
104
  try:
105
  demo.launch(share=False, server_port=7860)
 
110
 
111
 
112
 
113
+