tejovanth committed on
Commit
97db6a8
·
verified ·
1 Parent(s): fa7e538

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -15
app.py CHANGED
@@ -1,23 +1,153 @@
1
- ```python
2
  import gradio as gr
3
- import logging
 
 
 
 
 
 
 
 
 
4
 
5
- # Existing imports and code (omitted for brevity) remain unchanged until the launch section
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  if __name__ == "__main__":
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
  try:
12
  logger.info("Starting Gradio application on http://127.0.0.1:7860")
13
  demo.launch(
14
  share=False,
15
  server_name="127.0.0.1",
16
  server_port=7860,
17
- debug=True # Enable debug mode for detailed error output
18
  )
19
  except Exception as e:
20
- logger.error(f"Gradio launch failed: {str(e)}")
21
  logger.info("Trying alternative port 7861...")
22
  try:
23
  demo.launch(
@@ -29,13 +159,6 @@ if __name__ == "__main__":
29
  except Exception as e2:
30
  logger.error(f"Gradio launch failed on port 7861: {str(e2)}")
31
  raise
32
- ```
33
-
34
- **Changes**:
35
- - Added explicit `server_name="127.0.0.1"` for local binding.
36
- - Enabled `debug=True` for detailed Gradio logs.
37
- - Added a fallback to try port 7861 if 7860 fails.
38
- - Enhanced logging with `logging` module to capture initialization errors.
39
 
40
 
41
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import torch
4
+ from transformers import pipeline
5
+ import time, logging, re
6
+ import matplotlib
7
+ matplotlib.use('Agg')
8
+ import matplotlib.pyplot as plt
9
+ import io
10
+ from PIL import Image
11
+ from concurrent.futures import ThreadPoolExecutor
12
 
13
# Configure logging at import time so the device and model-loading messages
# below are emitted at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set device (CPU or GPU): 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1
logger.info(f"🔧 Using {'GPU' if device == 0 else 'CPU'}")

# Load the summarization model once, up front; every request reuses it.
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
except Exception as e:
    logger.error(f"❌ Model loading failed: {str(e)}")
    # The bare `exit()` helper is injected by the `site` module for interactive
    # use and is not guaranteed to exist; raising SystemExit is the portable
    # way to stop the program with a non-zero status.
    raise SystemExit(1) from e
26
+
27
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time.

    Args:
        chunk_data: Sequence of dicts, each with a ``'chunk'`` label value and a
            ``'status'`` string; an optional ``'time'`` value (seconds) sets the
            bar length and defaults to 0.1 when absent.

    Returns:
        A PIL image decoded from the PNG rendering of the chart. Bars are
        green for ``'summarized'``, orange for ``'skipped'``, red for
        ``'error'`` and gray for any other status.
    """
    status_colors = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    labels = [f"C{item['chunk']}" for item in chunk_data]
    colors = [status_colors.get(item['status'], 'gray') for item in chunk_data]
    times = [item.get('time', 0.1) for item in chunk_data]

    fig, ax = plt.subplots(figsize=(10, 2.5))
    try:
        ax.barh(labels, times, color=colors)
        ax.set_xlabel("Time (s)")
        ax.set_title("📊 Chunk Processing Status")
        # Use the figure's own methods rather than pyplot's implicit
        # "current figure", so the output never depends on global state.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
43
+
44
def _is_summarized_entry(entry):
    """Return True when *entry* is a chunk line that carries a real summary."""
    if not entry.startswith("**Chunk"):
        return False
    # Only the header line carries the Skipped/Error markers; the summary
    # text itself may legitimately contain either word.
    header = entry.split("\n", 1)[0]
    return "Skipped" not in header and "Error" not in header


def create_summary_flowchart(summaries):
    """Draw the successfully summarized chunks as a top-to-bottom flowchart.

    Args:
        summaries: Iterable of result strings. Entries that start with
            ``"**Chunk"`` and whose header line does not mention ``"Skipped"``
            or ``"Error"`` are drawn; everything else is ignored.

    Returns:
        A PIL image decoded from the PNG rendering, or ``None`` when there is
        nothing to draw. Each box shows at most 120 characters of summary
        text followed by ``"..."``.
    """
    filtered = [s for s in summaries if _is_summarized_entry(s)]
    if not filtered:
        return None

    fig_height = max(2, len(filtered) * 0.8 + 1)
    fig, ax = plt.subplots(figsize=(6, fig_height))
    try:
        ax.axis('off')

        # Boxes sit two units apart, counting down so the first chunk is on top.
        ypos = list(range(len(filtered) * 2, 0, -2))
        boxprops = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
        last_index = len(filtered) - 1

        for index, (y, summary) in enumerate(zip(ypos, filtered)):
            # Drop the "**Chunk N**:" header and keep the text after it.
            summary_text = summary.split("**Chunk")[1]
            summary_text = summary_text.replace("**:", ":").split("\n", 1)[-1].strip()
            if len(summary_text) > 120:
                summary_text = summary_text[:120] + "..."
            ax.text(0.5, y, summary_text, ha='center', va='center', bbox=boxprops, fontsize=9)

            # Connect each box to the next one below it.
            if index < last_index:
                ax.annotate('', xy=(0.5, y - 1.2), xytext=(0.5, y - 0.3),
                            arrowprops=dict(arrowstyle="->", lw=1.5))

        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        # Always release the figure, even if rendering fails.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
76
+
77
def process_chunk(i, chunk):
    """Summarize one text chunk and record how it was handled.

    Args:
        i: Zero-based chunk index; reported to the user as ``i + 1``.
        chunk: The text to summarize.

    Returns:
        A ``(result, chunk_result)`` tuple. ``result`` is the display string
        for the chunk. ``chunk_result`` is a dict with ``'chunk'`` (one-based
        number), ``'status'`` (``'summarized'``, ``'skipped'`` or ``'error'``)
        and ``'time'`` (elapsed seconds).
    """
    chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
    # perf_counter is monotonic, so the elapsed time cannot go negative if the
    # wall clock is adjusted while the chunk is being processed.
    start_time = time.perf_counter()

    if not chunk:
        # Guard the ratio below against division by zero.
        result = f"**Chunk {i+1}**: Skipped (empty)"
        chunk_result['status'] = 'skipped'
    elif sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
        # More than half the characters are non-alphanumeric.
        result = f"**Chunk {i+1}**: Skipped (equation-heavy)"
        chunk_result['status'] = 'skipped'
    else:
        try:
            summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
            result = f"**Chunk {i+1}**:\n{summary}"
            chunk_result['status'] = 'summarized'
        except Exception as e:
            # Report the failure in the output instead of aborting the batch.
            result = f"**Chunk {i+1}**: ❌ Error: {str(e)}"
            chunk_result['status'] = 'error'

    chunk_result['time'] = time.perf_counter() - start_time
    return result, chunk_result
95
+
96
def summarize_file(file_bytes):
    """Extract text from an uploaded PDF, summarize it in chunks, and chart it.

    Args:
        file_bytes: Raw bytes of the uploaded PDF. ``None`` or empty bytes are
            reported as a missing upload.

    Returns:
        A ``(summary_text, status_image, flow_image)`` tuple. On any failure
        before summarization the first element is an error message and both
        images are ``None``.
    """
    start = time.time()

    if not file_bytes:
        return "❌ No file uploaded", None, None

    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # strip $...$ delimiters
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
        text = "".join(c for c in text if ord(c) < 128)  # keep ASCII only
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}", None, None

    if not text.strip():
        return "❌ No text found", None, None

    # Only the first 30,000 characters are processed, in 1,500-character chunks.
    chunk_size = 1500
    max_chars = 30000
    chunks = [text[pos:pos + chunk_size] for pos in range(0, min(len(text), max_chars), chunk_size)]

    # executor.map preserves input order, so results line up with chunks.
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(process_chunk, range(len(chunks)), chunks))

    summaries = [summary for summary, _ in results]
    chunk_info = [info for _, info in results]

    final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
    process_img = visualize_chunk_status(chunk_info)
    flow_img = create_summary_flowchart(summaries)
    return final_summary, process_img, flow_img
127
+
128
# Gradio UI: one PDF upload in; the summary text plus the two charts returned
# by summarize_file out.
# NOTE(review): the emoji in these labels were restored from mis-encoded text;
# the "Flow Summary" glyph is a best guess, so confirm it against the original file.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload PDF", type="binary"),
    outputs=[
        gr.Textbox(label="📝 Summary", lines=20),
        gr.Image(label="📊 Chunk Status", type="pil"),
        gr.Image(label="🔍 Flow Summary", type="pil"),
    ],
    title="📘 PDF Summarizer with Visual Flow",
    description="Summarizes up to 30,000 characters from a PDF. Includes chunk status and flowchart visualizations.",
)
139
 
140
  if __name__ == "__main__":
 
 
 
141
  try:
142
  logger.info("Starting Gradio application on http://127.0.0.1:7860")
143
  demo.launch(
144
  share=False,
145
  server_name="127.0.0.1",
146
  server_port=7860,
147
+ debug=True
148
  )
149
  except Exception as e:
150
+ logger.error(f"Gradio launch failed on port 7860: {str(e)}")
151
  logger.info("Trying alternative port 7861...")
152
  try:
153
  demo.launch(
 
159
  except Exception as e2:
160
  logger.error(f"Gradio launch failed on port 7861: {str(e2)}")
161
  raise
 
 
 
 
 
 
 
162
 
163
 
164