tejovanth committed on
Commit
ac73cea
·
verified ·
1 Parent(s): 681c001

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -121
app.py CHANGED
@@ -1,138 +1,81 @@
1
- import gradio as gr
2
- import fitz # PyMuPDF
3
- import torch
4
- from transformers import pipeline
5
- import time, logging, re
6
- import matplotlib
7
- matplotlib.use('Agg')
8
  import matplotlib.pyplot as plt
 
9
  import io
10
  from PIL import Image
11
 
12
# Only surface errors from third-party libraries; model loading is chatty.
logging.basicConfig(level=logging.ERROR)

# Select the inference device: 0 when CUDA is available, otherwise -1.
# The value is handed to `pipeline(..., device=...)` below.
device = 0 if torch.cuda.is_available() else -1
print(f"🔧 Using {'GPU' if device == 0 else 'CPU'}")

# Load the summarization model once at import time. The app cannot do
# anything useful without it, so a load failure terminates the process.
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # The bare `exit` name is injected by the `site` module for interactive
    # sessions and is not guaranteed to exist (e.g. under `python -S`), so
    # raise SystemExit directly to stop with the same exit status.
    raise SystemExit(1) from e
24
-
25
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time.

    Args:
        chunk_data: Sequence of dicts. Each dict provides ``'chunk'`` (used
            for the bar label) and ``'status'`` (mapped to a bar colour;
            unknown statuses are drawn gray). ``'time'`` is optional and
            defaults to 0.1 so every chunk still gets a visible bar.

    Returns:
        A PIL image decoded from an in-memory PNG of the chart.
    """
    status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    labels = [f"C{entry['chunk']}" for entry in chunk_data]
    colors = [status_colors.get(entry['status'], 'gray') for entry in chunk_data]
    times = [entry.get('time', 0.1) for entry in chunk_data]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    try:
        ax.barh(labels, times, color=colors)
        ax.set_xlabel("Time (s)")
        ax.set_title("📊 Chunk Processing Status")
        # Operate on this figure explicitly rather than on pyplot's global
        # "current figure", which another caller could have replaced.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
41
-
42
def create_summary_flowchart(summaries):
    """Draw the successfully summarized chunks as a vertical chain of boxes.

    Args:
        summaries: List of strings. Entries that start with ``"**Chunk"``
            and whose first line mentions neither "Skipped" nor "Error" are
            treated as real summaries; everything else is ignored.

    Returns:
        A PIL image decoded from an in-memory PNG, or ``None`` when no entry
        qualifies.
    """
    filtered = []
    for entry in summaries:
        if not entry.startswith("**Chunk"):
            continue
        # Only inspect the header line: the summary body itself may
        # legitimately contain the words "Skipped" or "Error".
        header = entry.split("\n", 1)[0]
        if "Skipped" in header or "Error" in header:
            continue
        filtered.append(entry)
    if not filtered:
        return None

    fig_height = max(2, len(filtered) * 0.8 + 1)
    fig, ax = plt.subplots(figsize=(6, fig_height))
    try:
        ax.axis('off')
        # Text and annotations do not autoscale the axes. Without explicit
        # limits every box sits outside the default 0..1 range and the
        # arrows (clipped when their target is outside the axes) vanish.
        ax.set_xlim(0, 1)
        ax.set_ylim(0, len(filtered) * 2 + 1)

        # Boxes are placed two data units apart, first summary on top.
        ypos = list(range(len(filtered) * 2, 0, -2))
        boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")

        for i, (y, summary) in enumerate(zip(ypos, filtered)):
            # Drop the "**Chunk N**:" header and keep only the summary body.
            summary_text = summary.split("**Chunk")[1]
            summary_text = summary_text.replace("**:", ":").split("\n", 1)[-1].strip()
            if len(summary_text) > 120:
                summary_text = summary_text[:120] + "..."
            ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)

            # Connect each box to the one below it.
            if i < len(filtered) - 1:
                ax.annotate('', xy=(0.5, y - 1.2), xytext=(0.5, y - 0.3),
                            arrowprops=dict(arrowstyle="->", lw=1.5))

        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
74
 
75
def summarize_file(file_bytes):
    """Summarize an uploaded PDF chunk by chunk and build two status images.

    Args:
        file_bytes: Raw bytes of the PDF, passed to PyMuPDF as a stream.

    Returns:
        A ``(summary_text, chunk_status_image, flow_image)`` tuple. When
        text extraction fails or yields nothing, the first element is an
        error message and both images are ``None``.
    """
    chunk_size = 1500   # characters handed to the model per call
    max_chars = 30000   # only this much of the document is processed

    start = time.time()
    chunk_info = []
    summaries = []

    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        # Unwrap inline $...$ math, spell out \cap, collapse whitespace and
        # drop every non-ASCII character before summarizing.
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()
        text = "".join(c for c in text if ord(c) < 128)
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}", None, None

    if not text:
        return "❌ No text found", None, None

    chunks = [
        text[offset:offset + chunk_size]
        for offset in range(0, min(len(text), max_chars), chunk_size)
    ]
    for number, chunk in enumerate(chunks, start=1):
        chunk_start = time.time()
        chunk_result = {'chunk': number, 'status': '', 'time': 0}

        # Chunks that are mostly non-alphanumeric are skipped and labelled
        # as equation-heavy instead of being summarized.
        non_alnum = sum(1 for c in chunk if not c.isalnum())
        if non_alnum / len(chunk) > 0.5:
            summaries.append(f"**Chunk {number}**: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
                summaries.append(f"**Chunk {number}**:\n{summary}")
                chunk_result['status'] = 'summarized'
            except Exception as e:
                # One bad chunk must not abort the whole document.
                summaries.append(f"**Chunk {number}**: ❌ Error: {str(e)}")
                chunk_result['status'] = 'error'

        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
    process_img = visualize_chunk_status(chunk_info)
    flow_img = create_summary_flowchart(summaries)
    return final_summary, process_img, flow_img
117
-
118
# Gradio UI: one PDF upload in; summary text plus two images out.
_pdf_input = gr.File(label="📄 Upload PDF", type="binary")
_result_views = [
    gr.Textbox(label="📝 Summary", lines=20),
    gr.Image(label="📊 Chunk Status", type="pil"),
    gr.Image(label="🔁 Flow Summary", type="pil"),
]

demo = gr.Interface(
    fn=summarize_file,
    inputs=_pdf_input,
    outputs=_result_views,
    title="📘 PDF Summarizer with Visual Flow",
    description="Summarizes up to 30,000 characters from a PDF. Includes chunk status and flowchart visualizations.",
)

if __name__ == "__main__":
    # Report a failed launch on stdout instead of dying with a traceback.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as launch_error:
        print(f"❌ Gradio launch failed: {str(launch_error)}")
135
-
136
 
137
 
138
 
 
 
 
 
 
 
 
 
1
  import matplotlib.pyplot as plt
2
+ from matplotlib.patches import FancyBboxPatch, Circle, FancyArrowPatch
3
  import io
4
  from PIL import Image
5
 
6
def create_process_flowchart():
    """Draw the PDF-summarization pipeline as a top-to-bottom flowchart.

    Nodes are laid out in a single column with even spacing. The two "No"
    branches of the decision diamonds are routed through side lanes so they
    do not cross the nodes they skip.

    Returns:
        A PIL image decoded from an in-memory PNG rendering of the chart.
    """
    center_x = 4.0      # x of the main column, in data units
    spacing = 1.0       # vertical distance between consecutive nodes
    lane_offset = 2.6   # distance from the column to a side lane
    box_pad = 0.1       # rounding pad of the box outline

    # (half_width, half_height) of each shape's visible outline. Arrows
    # start and end on these outlines instead of inside the shapes.
    extents = {"circle": (0.4, 0.4), "box": (1.6, 0.3), "diamond": (1.6, 0.45)}

    # (label, shape, fill colour), listed from top to bottom.
    steps = [
        ("Start", "circle", "lightgreen"),
        ("Load PDF", "box", "lightblue"),
        ("Extract Text", "box", "lightblue"),
        ("Text Valid?", "diamond", "lightyellow"),
        ("Split into Chunks", "box", "lightblue"),
        ("Process Chunks", "box", "lightblue"),
        ("Chunk Eligible?", "diamond", "lightyellow"),
        ("Summarize Chunk", "box", "lightblue"),
        ("Generate Visualizations", "box", "lightblue"),
        ("End", "circle", "lightcoral"),
    ]

    # Derive y from list order so nodes can never overlap and every arrow
    # points downward.
    top = len(steps) * spacing
    layout = {
        label: (top - index * spacing, shape)
        for index, (label, shape, _color) in enumerate(steps)
    }

    # (source, target, branch label, side): side 0 is a straight arrow down
    # the column; +1 / -1 detours through the right / left lane.
    edges = [
        ("Start", "Load PDF", "", 0),
        ("Load PDF", "Extract Text", "", 0),
        ("Extract Text", "Text Valid?", "", 0),
        ("Text Valid?", "Split into Chunks", "Yes", 0),
        ("Text Valid?", "End", "No", 1),
        ("Split into Chunks", "Process Chunks", "", 0),
        ("Process Chunks", "Chunk Eligible?", "", 0),
        ("Chunk Eligible?", "Summarize Chunk", "Yes", 0),
        ("Chunk Eligible?", "Generate Visualizations", "No", -1),
        ("Summarize Chunk", "Generate Visualizations", "", 0),
        ("Generate Visualizations", "End", "", 0),
    ]

    fig, ax = plt.subplots(figsize=(8, 10))
    try:
        ax.axis('off')
        # Equal aspect keeps the terminal circles round.
        ax.set_aspect('equal')
        ax.set_xlim(0, 2 * center_x)
        ax.set_ylim(0, top + spacing)

        # Draw nodes.
        for label, shape, color in steps:
            y, _shape = layout[label]
            half_w, half_h = extents[shape]
            if shape == "circle":
                patch = Circle((center_x, y), half_w, facecolor=color, edgecolor="black")
            elif shape == "box":
                # The pad grows the outline outward, so shrink the inner
                # rectangle by the same amount to hit the stated extents.
                patch = FancyBboxPatch(
                    (center_x - half_w + box_pad, y - half_h + box_pad),
                    2 * (half_w - box_pad), 2 * (half_h - box_pad),
                    boxstyle=f"round,pad={box_pad}",
                    facecolor=color, edgecolor="black",
                )
            else:  # diamond
                corners = [
                    (center_x, y + half_h),
                    (center_x + half_w, y),
                    (center_x, y - half_h),
                    (center_x - half_w, y),
                ]
                patch = plt.Polygon(corners, facecolor=color, edgecolor="black")
            ax.add_patch(patch)
            ax.text(center_x, y, label, ha='center', va='center', fontsize=10)

        # Draw arrows.
        for source, target, branch, side in edges:
            src_y, src_shape = layout[source]
            dst_y, dst_shape = layout[target]
            src_hw, src_hh = extents[src_shape]
            dst_hw, dst_hh = extents[dst_shape]

            if side == 0:
                tail = (center_x, src_y - src_hh)
                head = (center_x, dst_y + dst_hh)
                label_xy = (center_x + 0.15, (tail[1] + head[1]) / 2)
                label_align = dict(ha='left', va='center')
            else:
                # Leave from the diamond's side corner, run down the lane,
                # then enter the target from the same side.
                corner_x = center_x + side * src_hw
                lane_x = center_x + side * lane_offset
                ax.plot([corner_x, lane_x, lane_x], [src_y, src_y, dst_y],
                        color="black", lw=1.5)
                tail = (lane_x, dst_y)
                head = (center_x + side * dst_hw, dst_y)
                label_xy = ((corner_x + lane_x) / 2, src_y + 0.1)
                label_align = dict(ha='center', va='bottom')

            ax.add_patch(FancyArrowPatch(tail, head, arrowstyle="->",
                                         mutation_scale=20, lw=1.5,
                                         shrinkA=0, shrinkB=0))
            if branch:
                ax.text(label_xy[0], label_xy[1], branch, fontsize=8, **label_align)

        # Save to buffer.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
75
 
76
# Render the process diagram once and write it to disk as a PNG file.
flowchart = create_process_flowchart()
flowchart.save('summary_process_flowchart.png', format='PNG')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81