tejovanth committed on
Commit
681c001
·
verified ·
1 Parent(s): 3f9bd2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -35
app.py CHANGED
@@ -9,28 +9,30 @@ import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
 
12
- # Logging setup
13
  logging.basicConfig(level=logging.ERROR)
14
- print("⚠️ Running in CPU-only mode.")
15
 
16
- # Load summarization model
 
 
 
 
17
  try:
18
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)
19
  except Exception as e:
20
- raise RuntimeError(f"Model loading failed: {str(e)}")
 
21
 
22
  def visualize_chunk_status(chunk_data):
23
- colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
24
  labels = [f"C{i['chunk']}" for i in chunk_data]
25
- bar_colors = [colors.get(i['status'], 'gray') for i in chunk_data]
26
  times = [i.get('time', 0.1) for i in chunk_data]
27
 
28
  fig, ax = plt.subplots(figsize=(10, 2.5))
29
- ax.barh(labels, times, color=bar_colors)
30
  ax.set_xlabel("Time (s)")
31
  ax.set_title("📊 Chunk Processing Status")
32
  plt.tight_layout()
33
-
34
  buf = io.BytesIO()
35
  plt.savefig(buf, format='png')
36
  buf.seek(0)
@@ -38,36 +40,46 @@ def visualize_chunk_status(chunk_data):
38
  return Image.open(buf)
39
 
40
  def create_summary_flowchart(summaries):
41
- blocks = [s for s in summaries if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s]
42
- if not blocks:
 
 
 
43
  return None
44
 
45
- fig, ax = plt.subplots(figsize=(6, len(blocks) * 0.8 + 1))
 
46
  ax.axis('off')
47
- ypos = list(range(len(blocks) * 2, 0, -2))
 
48
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
49
 
50
- for i, (y, summary) in enumerate(zip(ypos, blocks)):
51
- text = summary.split("**Chunk")[-1].replace("**:", ":").split("\n", 1)[-1].strip()[:120]
52
- ax.text(0.5, y, text + ("..." if len(text) > 100 else ""),
53
- ha='center', va='center', bbox=boxprops, fontsize=9)
54
- if i < len(blocks) - 1:
55
- ax.annotate('', xy=(0.5, y - 1), xytext=(0.5, y - 0.2), arrowprops=dict(arrowstyle="->", lw=1.5))
 
 
 
 
56
 
57
- buf = io.BytesIO()
58
  plt.tight_layout()
59
- plt.savefig(buf, format='png')
 
60
  buf.seek(0)
61
  plt.close(fig)
62
  return Image.open(buf)
63
 
64
  def summarize_file(file_bytes):
65
  start = time.time()
66
- chunk_info, summaries = [], []
 
67
 
68
  try:
69
  doc = fitz.open(stream=file_bytes, filetype="pdf")
70
- text = " ".join(page.get_text("text") for page in doc)
71
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
72
  text = re.sub(r"\\cap", "intersection", text)
73
  text = re.sub(r"\s+", " ", text).strip()
@@ -75,37 +87,39 @@ def summarize_file(file_bytes):
75
  except Exception as e:
76
  return f"❌ Text extraction failed: {str(e)}", None, None
77
 
78
- if not text:
79
- return "❌ No text found in PDF.", None, None
80
 
81
  chunks = [text[i:i+1500] for i in range(0, min(len(text), 30000), 1500)]
82
  for i, chunk in enumerate(chunks):
83
- t0 = time.time()
84
- info = {'chunk': i + 1, 'status': '', 'time': 0}
85
 
86
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
87
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
88
- info['status'] = 'skipped'
89
  else:
90
  try:
91
  summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
92
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
93
- info['status'] = 'summarized'
94
  except Exception as e:
95
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
96
- info['status'] = 'error'
97
 
98
- info['time'] = time.time() - t0
99
- chunk_info.append(info)
100
 
101
  final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
102
- return final_summary, visualize_chunk_status(chunk_info), create_summary_flowchart(summaries)
 
 
103
 
104
  demo = gr.Interface(
105
  fn=summarize_file,
106
  inputs=gr.File(label="📄 Upload PDF", type="binary"),
107
  outputs=[
108
- gr.Textbox(label="📝 Summary"),
109
  gr.Image(label="📊 Chunk Status", type="pil"),
110
  gr.Image(label="πŸ” Flow Summary", type="pil")
111
  ],
@@ -114,7 +128,10 @@ demo = gr.Interface(
114
  )
115
 
116
  if __name__ == "__main__":
117
- demo.launch(server_port=7860, share=False)
 
 
 
118
 
119
 
120
 
 
9
  import io
10
  from PIL import Image
11
 
 
12
  logging.basicConfig(level=logging.ERROR)
 
13
 
14
+ # Set device (CPU or GPU)
15
+ device = 0 if torch.cuda.is_available() else -1
16
+ print(f"🔧 Using {'GPU' if device == 0 else 'CPU'}")
17
+
18
+ # Load model
19
  try:
20
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
21
  except Exception as e:
22
+ print(f"❌ Model loading failed: {str(e)}")
23
+ exit(1)
24
 
25
  def visualize_chunk_status(chunk_data):
26
+ status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
27
  labels = [f"C{i['chunk']}" for i in chunk_data]
28
+ colors = [status_colors.get(i['status'], 'gray') for i in chunk_data]
29
  times = [i.get('time', 0.1) for i in chunk_data]
30
 
31
  fig, ax = plt.subplots(figsize=(10, 2.5))
32
+ ax.barh(labels, times, color=colors)
33
  ax.set_xlabel("Time (s)")
34
  ax.set_title("📊 Chunk Processing Status")
35
  plt.tight_layout()
 
36
  buf = io.BytesIO()
37
  plt.savefig(buf, format='png')
38
  buf.seek(0)
 
40
  return Image.open(buf)
41
 
42
  def create_summary_flowchart(summaries):
43
+ filtered = [
44
+ s for s in summaries
45
+ if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s
46
+ ]
47
+ if not filtered:
48
  return None
49
 
50
+ fig_height = max(2, len(filtered) * 0.8 + 1)
51
+ fig, ax = plt.subplots(figsize=(6, fig_height))
52
  ax.axis('off')
53
+
54
+ ypos = list(range(len(filtered) * 2, 0, -2))
55
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
56
 
57
+ for i, (y, summary) in enumerate(zip(ypos, filtered)):
58
+ summary_text = summary.split("**Chunk")[1]
59
+ summary_text = summary_text.replace("**:", ":").split("\n", 1)[-1].strip()
60
+ if len(summary_text) > 120:
61
+ summary_text = summary_text[:120] + "..."
62
+ ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)
63
+
64
+ if i < len(filtered) - 1:
65
+ ax.annotate('', xy=(0.5, y - 1.2), xytext=(0.5, y - 0.3),
66
+ arrowprops=dict(arrowstyle="->", lw=1.5))
67
 
 
68
  plt.tight_layout()
69
+ buf = io.BytesIO()
70
+ fig.savefig(buf, format='png', bbox_inches='tight')
71
  buf.seek(0)
72
  plt.close(fig)
73
  return Image.open(buf)
74
 
75
  def summarize_file(file_bytes):
76
  start = time.time()
77
+ chunk_info = []
78
+ summaries = []
79
 
80
  try:
81
  doc = fitz.open(stream=file_bytes, filetype="pdf")
82
+ text = "".join(page.get_text("text") for page in doc)
83
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
84
  text = re.sub(r"\\cap", "intersection", text)
85
  text = re.sub(r"\s+", " ", text).strip()
 
87
  except Exception as e:
88
  return f"❌ Text extraction failed: {str(e)}", None, None
89
 
90
+ if not text.strip():
91
+ return "❌ No text found", None, None
92
 
93
  chunks = [text[i:i+1500] for i in range(0, min(len(text), 30000), 1500)]
94
  for i, chunk in enumerate(chunks):
95
+ chunk_start = time.time()
96
+ chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
97
 
98
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
99
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
100
+ chunk_result['status'] = 'skipped'
101
  else:
102
  try:
103
  summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
104
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
105
+ chunk_result['status'] = 'summarized'
106
  except Exception as e:
107
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
108
+ chunk_result['status'] = 'error'
109
 
110
+ chunk_result['time'] = time.time() - chunk_start
111
+ chunk_info.append(chunk_result)
112
 
113
  final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
114
+ process_img = visualize_chunk_status(chunk_info)
115
+ flow_img = create_summary_flowchart(summaries)
116
+ return final_summary, process_img, flow_img
117
 
118
  demo = gr.Interface(
119
  fn=summarize_file,
120
  inputs=gr.File(label="📄 Upload PDF", type="binary"),
121
  outputs=[
122
+ gr.Textbox(label="📝 Summary", lines=20),
123
  gr.Image(label="📊 Chunk Status", type="pil"),
124
  gr.Image(label="πŸ” Flow Summary", type="pil")
125
  ],
 
128
  )
129
 
130
  if __name__ == "__main__":
131
+ try:
132
+ demo.launch(share=False, server_port=7860)
133
+ except Exception as e:
134
+ print(f"❌ Gradio launch failed: {str(e)}")
135
 
136
 
137