Ilia Tambovtsev committed on
Commit
43554ac
·
1 Parent(s): f7f521b

feat: add gradio web app

Browse files
pyproject.toml CHANGED
@@ -20,6 +20,9 @@ langchain-openai = "^0.2.3"
20
  matplotlib = "^3.9.2"
21
  pandas = "^2.2.3"
22
  chromadb = "^0.5.20"
 
 
 
23
 
24
 
25
  [build-system]
 
20
  matplotlib = "^3.9.2"
21
  pandas = "^2.2.3"
22
  chromadb = "^0.5.20"
23
+ gradio = "^5.6.0"
24
+ gradio-pdf = "^0.0.19"
25
+ tabulate = "^0.9.0"
26
 
27
 
28
  [build-system]
src/config/output_formatting.py CHANGED
@@ -341,7 +341,7 @@ def display_search_result_page(
341
  print("\nChunk distances:")
342
  print("-" * 80)
343
  for chunk_type, distance in result.chunk_distances.items():
344
- status = f"distance: {distance:.3f}" if distance is not None else "not matched"
345
  print(f"{chunk_type}: {status}")
346
 
347
  # Display all chunks content
 
341
  print("\nChunk distances:")
342
  print("-" * 80)
343
  for chunk_type, distance in result.chunk_distances.items():
344
+ status = f"{distance:.3f}" if distance is not None else "not matched"
345
  print(f"{chunk_type}: {status}")
346
 
347
  # Display all chunks content
src/rag/storage.py CHANGED
@@ -57,6 +57,10 @@ class ScoredChunk(BaseModel):
57
  """Get chunk type from metadata"""
58
  return self.document.metadata["chunk_type"]
59
 
 
 
 
 
60
  model_config = ConfigDict(arbitrary_types_allowed=True)
61
 
62
 
@@ -97,10 +101,14 @@ class SearchResultPage(BaseModel):
97
  def best_score(self):
98
  return self.matched_chunk.score
99
 
 
 
 
 
100
  model_config = ConfigDict(arbitrary_types_allowed=True)
101
 
102
 
103
- class PresentationSearchResult(BaseModel):
104
  """Container for presentation-level search results
105
 
106
  Represents all matching slides from a single presentation
@@ -166,7 +174,7 @@ class SlideIndexer:
166
  metadata = dict(
167
  # Basic slide info
168
  pdf_path=str(slide.pdf_path),
169
- page_num=str(slide.page_num),
170
  # Chunk specific
171
  chunk_type=chunk_type,
172
  slide_id=f"{slide.pdf_path.stem}__{slide.page_num}",
@@ -453,7 +461,7 @@ class ChromaSlideStore:
453
  search_results = self.search_query(
454
  query=query,
455
  chunk_types=chunk_types,
456
- n_results=n_results * 5, # Get more to handle duplicates
457
  max_score=max_distance,
458
  metadata_filter=metadata_filter,
459
  )
@@ -500,20 +508,20 @@ class ChromaSlideStore:
500
  )
501
  page_results.append(result)
502
 
503
- if len(page_results) == n_results:
504
- break
505
 
506
- return page_results[:n_results]
507
 
508
  def search_query_presentations(
509
  self,
510
  query: str,
511
  chunk_types: Optional[List[str]] = None,
512
  n_results: int = 3,
513
- n_slides_per_presentation: int = 4,
514
  max_distance: float = 2.0,
515
  metadata_filter: Optional[Dict] = None,
516
- ) -> List[PresentationSearchResult]:
517
  """Search presentations based on query and return grouped results
518
 
519
  Args:
@@ -532,7 +540,7 @@ class ChromaSlideStore:
532
  search_results = self.search_query_pages(
533
  query=query,
534
  chunk_types=chunk_types,
535
- n_results=n_results * n_slides_per_presentation * 2,
536
  max_distance=max_distance,
537
  metadata_filter=metadata_filter,
538
  )
@@ -553,12 +561,12 @@ class ChromaSlideStore:
553
  if len(presentations_map[pres_name]) < n_slides_per_presentation:
554
  presentations_map[pres_name].append(result)
555
 
556
- # Convert to PresentationSearchResult objects
557
  presentation_results = []
558
 
559
  for pres_name, slides in presentations_map.items():
560
  # Create presentation result
561
- pres_result = PresentationSearchResult(
562
  slides=slides,
563
  # NOTE: This is only for testing. Can be removed
564
  metadata=dict(
@@ -570,13 +578,13 @@ class ChromaSlideStore:
570
  )
571
  presentation_results.append(pres_result)
572
 
573
- if len(presentation_results) == n_results:
574
- break
575
 
576
  # TODO: Gotta check different ways to sort
577
  presentation_results.sort(key=lambda x: x.mean_score)
578
 
579
- return presentation_results[:n_results]
580
 
581
  def get_by_metadata(
582
  self, metadata_filter: Dict, n_results: Optional[int] = None
 
57
  """Get chunk type from metadata"""
58
  return self.document.metadata["chunk_type"]
59
 
60
+ @property
61
+ def page_num(self) -> int:
62
+ return int(self.document.metadata["page_num"])
63
+
64
  model_config = ConfigDict(arbitrary_types_allowed=True)
65
 
66
 
 
101
  def best_score(self):
102
  return self.matched_chunk.score
103
 
104
+ @property
105
+ def page_num(self):
106
+ return self.matched_chunk.page_num
107
+
108
  model_config = ConfigDict(arbitrary_types_allowed=True)
109
 
110
 
111
+ class SearchResultPresentation(BaseModel):
112
  """Container for presentation-level search results
113
 
114
  Represents all matching slides from a single presentation
 
174
  metadata = dict(
175
  # Basic slide info
176
  pdf_path=str(slide.pdf_path),
177
+ page_num=str(slide.page_num), # BUG: why str?
178
  # Chunk specific
179
  chunk_type=chunk_type,
180
  slide_id=f"{slide.pdf_path.stem}__{slide.page_num}",
 
461
  search_results = self.search_query(
462
  query=query,
463
  chunk_types=chunk_types,
464
+ n_results=n_results * 3, # Get more to handle duplicates
465
  max_score=max_distance,
466
  metadata_filter=metadata_filter,
467
  )
 
508
  )
509
  page_results.append(result)
510
 
511
+ # if len(page_results) == n_results:
512
+ # break
513
 
514
+ return page_results # [:n_results]
515
 
516
  def search_query_presentations(
517
  self,
518
  query: str,
519
  chunk_types: Optional[List[str]] = None,
520
  n_results: int = 3,
521
+ n_slides_per_presentation: int = 3,
522
  max_distance: float = 2.0,
523
  metadata_filter: Optional[Dict] = None,
524
+ ) -> List[SearchResultPresentation]:
525
  """Search presentations based on query and return grouped results
526
 
527
  Args:
 
540
  search_results = self.search_query_pages(
541
  query=query,
542
  chunk_types=chunk_types,
543
+ n_results=n_results * n_slides_per_presentation,
544
  max_distance=max_distance,
545
  metadata_filter=metadata_filter,
546
  )
 
561
  if len(presentations_map[pres_name]) < n_slides_per_presentation:
562
  presentations_map[pres_name].append(result)
563
 
564
+ # Convert to SearchResultPresentation objects
565
  presentation_results = []
566
 
567
  for pres_name, slides in presentations_map.items():
568
  # Create presentation result
569
+ pres_result = SearchResultPresentation(
570
  slides=slides,
571
  # NOTE: This is only for testing. Can be removed
572
  metadata=dict(
 
578
  )
579
  presentation_results.append(pres_result)
580
 
581
+ # if len(presentation_results) == n_results:
582
+ # break
583
 
584
  # TODO: Gotta check different ways to sort
585
  presentation_results.sort(key=lambda x: x.mean_score)
586
 
587
+ return presentation_results # [:n_results]
588
 
589
  def get_by_metadata(
590
  self, metadata_filter: Dict, n_results: Optional[int] = None
src/webapp/__init__.py ADDED
File without changes
src/webapp/app.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+ from textwrap import dedent
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from gradio_pdf import PDF
10
+ from pymupdf.mupdf import ll_pdf_annot_modification_date
11
+
12
+ from src.config import Config, Navigator
13
+ from src.rag.storage import ChromaSlideStore, SearchResultPage, SearchResultPresentation
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def format_page_results(result_page: SearchResultPage) -> str:
    """Render one slide-level search hit as a markdown fragment.

    The fragment consists of a header (1-based page number, type and score
    of the best matching chunk), a markdown table of per-chunk distances,
    and a "Content" section that quotes the text of every matched chunk.

    Args:
        result_page: Search result for a single slide. The function reads
            its ``page_num``, ``matched_chunk``, ``chunk_distances`` and
            ``slide_chunks`` attributes.

    Returns:
        Markdown text describing the slide.
    """
    slide_chunks = result_page.slide_chunks

    # Trailing backslash after the chunk type is a markdown hard line break
    header = dedent(
        f"""\
        ### Page: {result_page.page_num+1}
        **Best matching chunk:** `{result_page.matched_chunk.chunk_type}`\\
        **Chunk distances:** {result_page.matched_chunk.score:.4f}
        """
    )

    def _as_label(value):
        # Chunks that were not matched carry None instead of a distance
        return f"{value:.4f}" if value is not None else "not matched"

    # One row per chunk type, one "distance" column holding formatted labels
    distances = pd.DataFrame(result_page.chunk_distances, index=["distance"]).T
    distances["distance"] = distances["distance"].apply(_as_label)
    distances = distances.reset_index(names="chunk type").sort_values("distance")

    pieces = [
        header,
        f"\n{distances.to_markdown(index=False)}\n",
        "#### Content:\n",
    ]

    # Quote the text of every chunk that actually matched
    for chunk_type, distance in distances.itertuples(index=False, name=None):
        if distance == "not matched":
            continue

        # Prefix every line with "> " so the whole chunk renders as one embed
        quoted = slide_chunks[chunk_type].page_content.replace("\n", "\n>\n> ")
        pieces.append(f"`{chunk_type}` d={distance}\n")
        pieces.append("> " + quoted + "\n\n")

    return "".join(pieces)
60
+
61
+
62
def format_presentation_results(
    pres_result: SearchResultPresentation,
) -> Tuple[str, Path, int]:
    """Render one presentation-level search result as markdown.

    Args:
        pres_result: Search result grouping the matched slides of a single
            presentation. The function reads its ``best_slide``, ``slides``
            and ``mean_score`` attributes.

    Returns:
        A tuple ``(text, pdf_path, page_num)``: the markdown report, the path
        of the PDF taken from the best slide, and the best slide's page
        number as stored (no +1 offset is applied to this value, unlike the
        page numbers shown in the report).
    """
    top_slide = pres_result.best_slide
    pdf_path = Path(top_slide.pdf_path)
    page_num = int(top_slide.page_num)

    # Summary table: displayed (+1) page number and best score per slide
    summary = pd.DataFrame(
        {
            "page_nums": [s.page_num + 1 for s in pres_result.slides],
            "page_scores": [f"{s.best_score:.4f}" for s in pres_result.slides],
        }
    )
    summary_md = summary.to_markdown(index=False)

    sections = [
        f"## {pdf_path.stem}\n",
        f"\n{summary_md}\n\n",
        f"**Mean Score:** {pres_result.mean_score:.4f}\n",
    ]

    # Per-slide details, each followed by a horizontal rule
    for slide in pres_result.slides:
        sections.append(format_page_results(slide))
        sections.append("\n\n---\n\n")

    return "".join(sections), pdf_path, page_num
93
+
94
+
95
class RagInterface:
    """Gradio interface for RAG application

    Builds a search form (query, number of presentations, pages per
    presentation, maximum distance) next to a fixed set of result tabs.
    Each tab holds a PDF viewer and a markdown report for one presentation.
    """

    # Number of result tabs created in the layout. The search callback must
    # return exactly one (PDF, Markdown) pair per tab, so the layout and the
    # callback both read this single constant.
    N_RESULT_TABS = 3

    def __init__(self, store: ChromaSlideStore, config: Optional[Config] = None):
        """Initialize interface

        Args:
            store: Configured vector store
            config: Optional application config
        """
        self.store = store
        self.config = config or Config()
        self.nav = self.config.navigator

        # Create interface
        self.interface = gr.Blocks()
        self._build_interface()

    def _build_interface(self):
        """Build Gradio interface layout and wire the search button"""
        with self.interface:
            gr.Markdown("# Presentation Search")

            with gr.Row():
                # Input components
                with gr.Column(scale=2):
                    query = gr.Textbox(
                        label="Search Query",
                        placeholder="Enter your search query...",
                        lines=3,
                    )
                    with gr.Row():
                        # NOTE: only the first N_RESULT_TABS presentations
                        # are displayed, whatever value is chosen here.
                        n_results = gr.Number(
                            label="Number of Presentations",
                            scale=1,
                            minimum=1,
                            maximum=10,
                            value=3,
                            step=1,
                            precision=0,  # deliver an int to the callback
                        )
                        n_pages_per_pres = gr.Number(
                            label="Number of pages per presentation",
                            scale=1,
                            minimum=1,
                            maximum=5,
                            value=2,
                            step=1,
                            precision=0,  # deliver an int to the callback
                        )
                        max_distance = gr.Number(
                            label="Maximum Distance",
                            scale=1,
                            minimum=0.1,
                            maximum=2.0,
                            value=2.0,
                            step=0.1,
                        )

                    search_btn = gr.Button("Search", size="lg", scale=3)

                # Results container
                with gr.Column(scale=3):
                    with gr.Tabs():
                        # Create identical result tabs
                        result_components = []
                        for i in range(self.N_RESULT_TABS):
                            with gr.Tab(f"Result {i+1}"):
                                with gr.Column():
                                    # PDF viewer
                                    pdf = PDF(
                                        label="Presentation",
                                        height=500,
                                        interactive=False,
                                        visible=False,
                                    )
                                    # Results text
                                    results = gr.Markdown(
                                        label="Search Results", visible=False
                                    )
                                    result_components.append((pdf, results))

            # Wire up the search function. Outputs are flattened to
            # [pdf_1, text_1, pdf_2, text_2, ...], the order _search returns.
            search_btn.click(
                fn=self._search,
                inputs=[query, n_results, n_pages_per_pres, max_distance],
                outputs=[item for pair in result_components for item in pair],
            )

    def _hidden_outputs(self) -> List[gr.components.Component]:
        """Return a hidden (PDF, Markdown) pair for every result tab"""
        outputs = []
        for _ in range(self.N_RESULT_TABS):
            # Fresh instances per tab instead of repeating the same objects
            outputs.extend([PDF(visible=False), gr.Markdown(visible=False)])
        return outputs

    def _search(
        self,
        query: str,
        n_results: int,
        n_pages: int,
        max_distance: float,
    ) -> List[gr.components.Component]:
        """Search presentations and format results

        Args:
            query: Search query text
            n_results: Number of presentations to request from the store
            n_pages: Number of slides to keep per presentation
            max_distance: Maximum distance threshold passed to the store

        Returns:
            List of components to update in UI: one (PDF, Markdown) pair per
            result tab. Tabs without a result, and all tabs when the query is
            blank or the search fails, are hidden.
        """
        # Nothing to search for: skip the store call entirely
        if not query or not query.strip():
            return self._hidden_outputs()

        try:
            # Search presentations. Numeric inputs are coerced to int because
            # the store multiplies them to size its candidate list.
            results = self.store.search_query_presentations(
                query=query,
                n_results=int(n_results),
                max_distance=max_distance,
                n_slides_per_presentation=int(n_pages),
            )

            # Prepare outputs for all possible tabs
            outputs = []
            for i in range(self.N_RESULT_TABS):
                if i >= len(results):
                    # Hide unused tabs
                    outputs.extend([PDF(visible=False), gr.Markdown(visible=False)])
                    continue

                # Format this result
                text, pdf_path, page = format_presentation_results(results[i])

                # Add components: PDF viewer and results text
                outputs.extend(
                    [
                        # PDF component
                        PDF(
                            value=str(pdf_path),
                            # Pages are 0-based in store but 1-based in PDF
                            starting_page=page + 1,
                            visible=True,
                        ),
                        # Results text
                        gr.Markdown(value=text, visible=True),
                    ]
                )

            return outputs

        except Exception:
            # UI boundary: log the traceback and show empty results rather
            # than surfacing the failure to the browser
            logger.exception("Search failed")
            return self._hidden_outputs()

    def launch(self, **kwargs):
        """Launch the Gradio interface

        Args:
            **kwargs: Forwarded unchanged to ``gr.Blocks.launch``
        """
        self.interface.launch(**kwargs)
248
+
249
+
250
def run_app(store: ChromaSlideStore, **kwargs):
    """Build the search UI around ``store`` and start serving it.

    Args:
        store: Configured ChromaSlideStore instance
        **kwargs: Additional arguments passed through to the interface launch
    """
    RagInterface(store).launch(**kwargs)
259
+
260
+
261
def main():
    """Command-line entry point for the presentation search web app.

    Loads environment variables from a dotenv file, parses the command-line
    options (collection name, host, port, public share link), opens the
    requested ChromaDB collection and launches the app.
    """
    # Imported lazily so the module can be imported without python-dotenv
    from dotenv import load_dotenv

    load_dotenv()

    # Command-line options, declared as (flag, add_argument keyword arguments)
    option_specs = (
        ("--collection", dict(default="pres0", help="ChromaDB collection name")),
        ("--host", dict(default="0.0.0.0", help="Host to run on")),
        ("--port", dict(type=int, default=7860, help="Port to run on")),
        ("--share", dict(action="store_true", help="Create public link")),
    )
    cli = argparse.ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # Open the vector store and hand it to the web app
    run_app(
        ChromaSlideStore(collection_name=opts.collection),
        server_name=opts.host,
        server_port=opts.port,
        share=opts.share,
    )


if __name__ == "__main__":
    main()