tejovanth committed on
Commit
3f9bd2c
·
verified ·
1 Parent(s): 313738c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -75
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- import fitz
3
  import torch
4
  from transformers import pipeline
5
  import time, logging, re
@@ -8,33 +8,29 @@ matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
- import nltk
12
-
13
- # Download punkt tokenizer if not already
14
- nltk.download('punkt', quiet=True)
15
- from nltk.tokenize import sent_tokenize
16
 
 
17
  logging.basicConfig(level=logging.ERROR)
18
- device = -1 # CPU-only
19
- print("⚠️ Optimized CPU-only version.")
20
 
 
21
  try:
22
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
23
  except Exception as e:
24
- print(f"Model loading failed: {str(e)}")
25
- exit(1)
26
 
27
  def visualize_chunk_status(chunk_data):
28
- status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
29
  labels = [f"C{i['chunk']}" for i in chunk_data]
30
- colors = [status_colors.get(i['status'], 'gray') for i in chunk_data]
31
  times = [i.get('time', 0.1) for i in chunk_data]
32
 
33
  fig, ax = plt.subplots(figsize=(10, 2.5))
34
- ax.barh(labels, times, color=colors)
35
  ax.set_xlabel("Time (s)")
36
  ax.set_title("📊 Chunk Processing Status")
37
  plt.tight_layout()
 
38
  buf = io.BytesIO()
39
  plt.savefig(buf, format='png')
40
  buf.seek(0)
@@ -42,34 +38,21 @@ def visualize_chunk_status(chunk_data):
42
  return Image.open(buf)
43
 
44
  def create_summary_flowchart(summaries):
45
- # Filter only successful summaries
46
- filtered = []
47
- for s in summaries:
48
- if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s:
49
- parts = s.split("**:", 1)
50
- if len(parts) > 1:
51
- filtered.append(parts[1].strip())
52
-
53
- if not filtered:
54
  return None
55
 
56
- fig_height = max(2, len(filtered) * 1.5)
57
- fig, ax = plt.subplots(figsize=(6, fig_height))
58
  ax.axis('off')
59
-
60
- ypos = list(range(len(filtered) * 2, 0, -2))
61
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
62
 
63
- for i, (y, summary) in enumerate(zip(ypos, filtered)):
64
- summary_text = summary.replace("\n", " ").strip()[:120]
65
- if len(summary_text) == 120:
66
- summary_text += "..."
67
-
68
- ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)
69
-
70
- if i < len(filtered) - 1:
71
- ax.annotate('', xy=(0.5, y - 1.5), xytext=(0.5, y - 0.5),
72
- arrowprops=dict(arrowstyle="->", lw=1.5))
73
 
74
  buf = io.BytesIO()
75
  plt.tight_layout()
@@ -78,31 +61,13 @@ def create_summary_flowchart(summaries):
78
  plt.close(fig)
79
  return Image.open(buf)
80
 
81
- def split_text_into_chunks(text, max_tokens=1500):
82
- sentences = sent_tokenize(text)
83
- chunks = []
84
- current_chunk = ""
85
-
86
- for sentence in sentences:
87
- if len(current_chunk) + len(sentence) <= max_tokens:
88
- current_chunk += " " + sentence
89
- else:
90
- chunks.append(current_chunk.strip())
91
- current_chunk = sentence
92
-
93
- if current_chunk:
94
- chunks.append(current_chunk.strip())
95
-
96
- return chunks[:20] # Limit to 20 chunks max
97
-
98
  def summarize_file(file_bytes):
99
  start = time.time()
100
- chunk_info = []
101
- summaries = []
102
 
103
  try:
104
  doc = fitz.open(stream=file_bytes, filetype="pdf")
105
- text = "".join(page.get_text("text") for page in doc)
106
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
107
  text = re.sub(r"\\cap", "intersection", text)
108
  text = re.sub(r"\s+", " ", text).strip()
@@ -110,34 +75,31 @@ def summarize_file(file_bytes):
110
  except Exception as e:
111
  return f"❌ Text extraction failed: {str(e)}", None, None
112
 
113
- if not text.strip():
114
- return "❌ No text found", None, None
115
-
116
- chunks = split_text_into_chunks(text)
117
 
 
118
  for i, chunk in enumerate(chunks):
119
- chunk_start = time.time()
120
- chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
121
 
122
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
123
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
124
- chunk_result['status'] = 'skipped'
125
  else:
126
  try:
127
  summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
128
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
129
- chunk_result['status'] = 'summarized'
130
  except Exception as e:
131
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
132
- chunk_result['status'] = 'error'
133
 
134
- chunk_result['time'] = time.time() - chunk_start
135
- chunk_info.append(chunk_result)
136
 
137
  final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
138
- process_img = visualize_chunk_status(chunk_info)
139
- flow_img = create_summary_flowchart(summaries)
140
- return final_summary, process_img, flow_img
141
 
142
  demo = gr.Interface(
143
  fn=summarize_file,
@@ -152,10 +114,8 @@ demo = gr.Interface(
152
  )
153
 
154
  if __name__ == "__main__":
155
- try:
156
- demo.launch(share=False, server_port=7860)
157
- except Exception as e:
158
- print(f"❌ Gradio launch failed: {str(e)}")
159
 
160
 
161
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
  import torch
4
  from transformers import pipeline
5
  import time, logging, re
 
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
 
 
 
 
 
11
 
12
+ # Logging setup
13
  logging.basicConfig(level=logging.ERROR)
14
+ print("⚠️ Running in CPU-only mode.")
 
15
 
16
+ # Load summarization model
17
  try:
18
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)
19
  except Exception as e:
20
+ raise RuntimeError(f"Model loading failed: {str(e)}")
 
21
 
22
  def visualize_chunk_status(chunk_data):
23
+ colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
24
  labels = [f"C{i['chunk']}" for i in chunk_data]
25
+ bar_colors = [colors.get(i['status'], 'gray') for i in chunk_data]
26
  times = [i.get('time', 0.1) for i in chunk_data]
27
 
28
  fig, ax = plt.subplots(figsize=(10, 2.5))
29
+ ax.barh(labels, times, color=bar_colors)
30
  ax.set_xlabel("Time (s)")
31
  ax.set_title("📊 Chunk Processing Status")
32
  plt.tight_layout()
33
+
34
  buf = io.BytesIO()
35
  plt.savefig(buf, format='png')
36
  buf.seek(0)
 
38
  return Image.open(buf)
39
 
40
  def create_summary_flowchart(summaries):
41
+ blocks = [s for s in summaries if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s]
42
+ if not blocks:
 
 
 
 
 
 
 
43
  return None
44
 
45
+ fig, ax = plt.subplots(figsize=(6, len(blocks) * 0.8 + 1))
 
46
  ax.axis('off')
47
+ ypos = list(range(len(blocks) * 2, 0, -2))
 
48
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
49
 
50
+ for i, (y, summary) in enumerate(zip(ypos, blocks)):
51
+ text = summary.split("**Chunk")[-1].replace("**:", ":").split("\n", 1)[-1].strip()[:120]
52
+ ax.text(0.5, y, text + ("..." if len(text) > 100 else ""),
53
+ ha='center', va='center', bbox=boxprops, fontsize=9)
54
+ if i < len(blocks) - 1:
55
+ ax.annotate('', xy=(0.5, y - 1), xytext=(0.5, y - 0.2), arrowprops=dict(arrowstyle="->", lw=1.5))
 
 
 
 
56
 
57
  buf = io.BytesIO()
58
  plt.tight_layout()
 
61
  plt.close(fig)
62
  return Image.open(buf)
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def summarize_file(file_bytes):
65
  start = time.time()
66
+ chunk_info, summaries = [], []
 
67
 
68
  try:
69
  doc = fitz.open(stream=file_bytes, filetype="pdf")
70
+ text = " ".join(page.get_text("text") for page in doc)
71
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
72
  text = re.sub(r"\\cap", "intersection", text)
73
  text = re.sub(r"\s+", " ", text).strip()
 
75
  except Exception as e:
76
  return f"❌ Text extraction failed: {str(e)}", None, None
77
 
78
+ if not text:
79
+ return "❌ No text found in PDF.", None, None
 
 
80
 
81
+ chunks = [text[i:i+1500] for i in range(0, min(len(text), 30000), 1500)]
82
  for i, chunk in enumerate(chunks):
83
+ t0 = time.time()
84
+ info = {'chunk': i + 1, 'status': '', 'time': 0}
85
 
86
  if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
87
  summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
88
+ info['status'] = 'skipped'
89
  else:
90
  try:
91
  summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
92
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
93
+ info['status'] = 'summarized'
94
  except Exception as e:
95
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
96
+ info['status'] = 'error'
97
 
98
+ info['time'] = time.time() - t0
99
+ chunk_info.append(info)
100
 
101
  final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
102
+ return final_summary, visualize_chunk_status(chunk_info), create_summary_flowchart(summaries)
 
 
103
 
104
  demo = gr.Interface(
105
  fn=summarize_file,
 
114
  )
115
 
116
  if __name__ == "__main__":
117
+ demo.launch(server_port=7860, share=False)
118
+
 
 
119
 
120
 
121