tejovanth committed on
Commit
ba5944b
·
verified ·
1 Parent(s): eedb41d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -25
app.py CHANGED
@@ -11,10 +11,10 @@ from PIL import Image
11
 
12
  logging.basicConfig(level=logging.ERROR)
13
  device = -1 # CPU-only
14
- print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")
15
 
16
  try:
17
- summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
18
  except Exception as e:
19
  print(f"❌ Model loading failed: {str(e)}")
20
  exit(1)
@@ -30,7 +30,6 @@ def visualize_chunk_status(chunk_data):
30
  ax.set_xlabel("Time (s)")
31
  ax.set_title("πŸ“Š Chunk Processing Status")
32
  plt.tight_layout()
33
-
34
  buf = io.BytesIO()
35
  plt.savefig(buf, format='png')
36
  buf.seek(0)
@@ -38,18 +37,23 @@ def visualize_chunk_status(chunk_data):
38
  return Image.open(buf)
39
 
40
  def create_summary_flowchart(summaries):
41
- fig, ax = plt.subplots(figsize=(6, len(summaries) * 0.8 + 1))
 
 
 
 
42
  ax.axis('off')
43
-
44
- ypos = list(range(len(summaries) * 2, 0, -2))
45
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
46
 
47
- for i, (y, summary) in enumerate(zip(ypos, summaries)):
48
- summary_text = summary.split("**Chunk")[1] if summary.startswith("**Chunk") else summary
49
- summary_text = summary_text.strip().replace("**:", ":")[:120] + ("..." if len(summary) > 120 else "")
50
- ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9, wrap=True)
 
51
 
52
- if i < len(summaries) - 1:
53
  ax.annotate('', xy=(0.5, y - 1), xytext=(0.5, y - 0.2),
54
  arrowprops=dict(arrowstyle="->", lw=1.5))
55
 
@@ -63,6 +67,7 @@ def create_summary_flowchart(summaries):
63
  def summarize_file(file_bytes):
64
  start = time.time()
65
  chunk_info = []
 
66
 
67
  try:
68
  doc = fitz.open(stream=file_bytes, filetype="pdf")
@@ -77,24 +82,17 @@ def summarize_file(file_bytes):
77
  if not text.strip():
78
  return "❌ No text found", None, None
79
 
80
- text = text[:300000]
81
- chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
82
- summaries = []
83
-
84
  for i, chunk in enumerate(chunks):
85
  chunk_start = time.time()
86
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
87
 
88
- if time.time() - start > 20:
89
- summaries.append("⚠️ Stopped early")
90
- break
91
-
92
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
93
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
94
  chunk_result['status'] = 'skipped'
95
  else:
96
  try:
97
- summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
98
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
99
  chunk_result['status'] = 'summarized'
100
  except Exception as e:
@@ -104,7 +102,7 @@ def summarize_file(file_bytes):
104
  chunk_result['time'] = time.time() - chunk_start
105
  chunk_info.append(chunk_result)
106
 
107
- final_summary = f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
108
  process_img = visualize_chunk_status(chunk_info)
109
  flow_img = create_summary_flowchart(summaries)
110
  return final_summary, process_img, flow_img
@@ -113,12 +111,12 @@ demo = gr.Interface(
113
  fn=summarize_file,
114
  inputs=gr.File(label="πŸ“„ Upload PDF", type="binary"),
115
  outputs=[
116
- gr.Textbox(label="πŸ“ Summarized Output"),
117
  gr.Image(label="πŸ“Š Chunk Status", type="pil"),
118
- gr.Image(label="πŸ” Flowchart Summary", type="pil")
119
  ],
120
- title="AI-Powered PDF Summarizer",
121
- description="Summarizes long PDFs (up to 300,000 characters) and visualizes chunk processing + flow of content."
122
  )
123
 
124
  if __name__ == "__main__":
@@ -128,3 +126,4 @@ if __name__ == "__main__":
128
  print(f"❌ Gradio launch failed: {str(e)}")
129
 
130
 
 
 
11
 
12
  logging.basicConfig(level=logging.ERROR)
13
  device = -1 # CPU-only
14
+ print("⚠️ Optimized CPU-only version.")
15
 
16
  try:
17
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
18
  except Exception as e:
19
  print(f"❌ Model loading failed: {str(e)}")
20
  exit(1)
 
30
  ax.set_xlabel("Time (s)")
31
  ax.set_title("πŸ“Š Chunk Processing Status")
32
  plt.tight_layout()
 
33
  buf = io.BytesIO()
34
  plt.savefig(buf, format='png')
35
  buf.seek(0)
 
37
  return Image.open(buf)
38
 
39
  def create_summary_flowchart(summaries):
40
+ filtered = [s for s in summaries if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s]
41
+ if not filtered:
42
+ return None
43
+
44
+ fig, ax = plt.subplots(figsize=(6, len(filtered) * 0.8 + 1))
45
  ax.axis('off')
46
+
47
+ ypos = list(range(len(filtered) * 2, 0, -2))
48
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
49
 
50
+ for i, (y, summary) in enumerate(zip(ypos, filtered)):
51
+ summary_text = summary.split("**Chunk")[1]
52
+ summary_text = summary_text.replace("**:", ":").split("\n", 1)[-1].strip()[:120]
53
+ ax.text(0.5, y, summary_text + ("..." if len(summary_text) > 100 else ""),
54
+ ha='center', va='center', bbox=boxprops, fontsize=9)
55
 
56
+ if i < len(filtered) - 1:
57
  ax.annotate('', xy=(0.5, y - 1), xytext=(0.5, y - 0.2),
58
  arrowprops=dict(arrowstyle="->", lw=1.5))
59
 
 
67
  def summarize_file(file_bytes):
68
  start = time.time()
69
  chunk_info = []
70
+ summaries = []
71
 
72
  try:
73
  doc = fitz.open(stream=file_bytes, filetype="pdf")
 
82
  if not text.strip():
83
  return "❌ No text found", None, None
84
 
85
+ chunks = [text[i:i+1500] for i in range(0, min(len(text), 30000), 1500)] # max 20 chunks
 
 
 
86
  for i, chunk in enumerate(chunks):
87
  chunk_start = time.time()
88
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
89
 
 
 
 
 
90
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
91
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
92
  chunk_result['status'] = 'skipped'
93
  else:
94
  try:
95
+ summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
96
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
97
  chunk_result['status'] = 'summarized'
98
  except Exception as e:
 
102
  chunk_result['time'] = time.time() - chunk_start
103
  chunk_info.append(chunk_result)
104
 
105
+ final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
106
  process_img = visualize_chunk_status(chunk_info)
107
  flow_img = create_summary_flowchart(summaries)
108
  return final_summary, process_img, flow_img
 
111
  fn=summarize_file,
112
  inputs=gr.File(label="πŸ“„ Upload PDF", type="binary"),
113
  outputs=[
114
+ gr.Textbox(label="πŸ“ Summary"),
115
  gr.Image(label="πŸ“Š Chunk Status", type="pil"),
116
+ gr.Image(label="πŸ” Flow Summary", type="pil")
117
  ],
118
+ title="πŸ“˜ PDF Summarizer with Visual Flow",
119
+ description="Summarizes up to 30,000 characters from a PDF. Includes chunk status and flowchart visualizations."
120
  )
121
 
122
  if __name__ == "__main__":
 
126
  print(f"❌ Gradio launch failed: {str(e)}")
127
 
128
 
129
+