tejovanth committed on
Commit
2d57a07
·
verified ·
1 Parent(s): ba5944b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -8
app.py CHANGED
@@ -8,6 +8,11 @@ matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
 
 
 
 
 
11
 
12
  logging.basicConfig(level=logging.ERROR)
13
  device = -1 # CPU-only
@@ -37,24 +42,33 @@ def visualize_chunk_status(chunk_data):
37
  return Image.open(buf)
38
 
39
  def create_summary_flowchart(summaries):
40
- filtered = [s for s in summaries if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s]
 
 
 
 
 
 
 
41
  if not filtered:
42
  return None
43
 
44
- fig, ax = plt.subplots(figsize=(6, len(filtered) * 0.8 + 1))
 
45
  ax.axis('off')
46
 
47
  ypos = list(range(len(filtered) * 2, 0, -2))
48
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
49
 
50
  for i, (y, summary) in enumerate(zip(ypos, filtered)):
51
- summary_text = summary.split("**Chunk")[1]
52
- summary_text = summary_text.replace("**:", ":").split("\n", 1)[-1].strip()[:120]
53
- ax.text(0.5, y, summary_text + ("..." if len(summary_text) > 100 else ""),
54
- ha='center', va='center', bbox=boxprops, fontsize=9)
 
55
 
56
  if i < len(filtered) - 1:
57
- ax.annotate('', xy=(0.5, y - 1), xytext=(0.5, y - 0.2),
58
  arrowprops=dict(arrowstyle="->", lw=1.5))
59
 
60
  buf = io.BytesIO()
@@ -64,6 +78,23 @@ def create_summary_flowchart(summaries):
64
  plt.close(fig)
65
  return Image.open(buf)
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def summarize_file(file_bytes):
68
  start = time.time()
69
  chunk_info = []
@@ -82,7 +113,8 @@ def summarize_file(file_bytes):
82
  if not text.strip():
83
  return "❌ No text found", None, None
84
 
85
- chunks = [text[i:i+1500] for i in range(0, min(len(text), 30000), 1500)] # max 20 chunks
 
86
  for i, chunk in enumerate(chunks):
87
  chunk_start = time.time()
88
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
@@ -127,3 +159,4 @@ if __name__ == "__main__":
127
 
128
 
129
 
 
 
8
  import matplotlib.pyplot as plt
9
  import io
10
  from PIL import Image
11
+ import nltk
12
+
13
+ # Download punkt tokenizer if not already
14
+ nltk.download('punkt', quiet=True)
15
+ from nltk.tokenize import sent_tokenize
16
 
17
  logging.basicConfig(level=logging.ERROR)
18
  device = -1 # CPU-only
 
42
  return Image.open(buf)
43
 
44
  def create_summary_flowchart(summaries):
45
+ # Filter only successful summaries
46
+ filtered = []
47
+ for s in summaries:
48
+ if s.startswith("**Chunk") and "Skipped" not in s and "Error" not in s:
49
+ parts = s.split("**:", 1)
50
+ if len(parts) > 1:
51
+ filtered.append(parts[1].strip())
52
+
53
  if not filtered:
54
  return None
55
 
56
+ fig_height = max(2, len(filtered) * 1.5)
57
+ fig, ax = plt.subplots(figsize=(6, fig_height))
58
  ax.axis('off')
59
 
60
  ypos = list(range(len(filtered) * 2, 0, -2))
61
  boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
62
 
63
  for i, (y, summary) in enumerate(zip(ypos, filtered)):
64
+ summary_text = summary.replace("\n", " ").strip()[:120]
65
+ if len(summary_text) == 120:
66
+ summary_text += "..."
67
+
68
+ ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)
69
 
70
  if i < len(filtered) - 1:
71
+ ax.annotate('', xy=(0.5, y - 1.5), xytext=(0.5, y - 0.5),
72
  arrowprops=dict(arrowstyle="->", lw=1.5))
73
 
74
  buf = io.BytesIO()
 
78
  plt.close(fig)
79
  return Image.open(buf)
80
 
81
def split_text_into_chunks(text, max_tokens=1500, max_chunks=20):
    """Split *text* into sentence-aligned chunks of bounded length.

    Sentences returned by ``sent_tokenize`` are packed greedily, joined by a
    single space, into chunks. Despite its name, ``max_tokens`` is a limit in
    characters (lengths are measured with ``len``), not in model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum length of each chunk, in characters. Must be >= 1.
        max_chunks: Maximum number of chunks to return; any text beyond that
            is dropped. Values below 1 yield an empty list.

    Returns:
        A list of at most ``max_chunks`` non-empty strings, each at most
        ``max_tokens`` characters long.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if max_chunks < 1:
        return []

    chunks = []
    current = ""

    for sentence in sent_tokenize(text):
        # A sentence longer than the limit (e.g. text without punctuation)
        # is hard-split so that no chunk can ever exceed max_tokens.
        for start in range(0, len(sentence), max_tokens):
            piece = sentence[start:start + max_tokens].strip()
            if not piece:
                continue

            if not current:
                # Nothing buffered yet: never flush an empty chunk.
                current = piece
            elif len(current) + 1 + len(piece) <= max_tokens:
                # The "+ 1" accounts for the joining space.
                current = current + " " + piece
            else:
                chunks.append(current)
                if len(chunks) >= max_chunks:
                    # Cap reached: stop early instead of packing text that
                    # would be discarded anyway.
                    return chunks
                current = piece

    if current:
        chunks.append(current)

    return chunks
97
+
98
  def summarize_file(file_bytes):
99
  start = time.time()
100
  chunk_info = []
 
113
  if not text.strip():
114
  return "❌ No text found", None, None
115
 
116
+ chunks = split_text_into_chunks(text)
117
+
118
  for i, chunk in enumerate(chunks):
119
  chunk_start = time.time()
120
  chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
 
159
 
160
 
161
 
162
+