# Spaces: Build error
# File size: 10,805 Bytes
import argparse
import logging
import os
from pathlib import Path
from textwrap import dedent
from typing import Dict, List, Optional, Tuple
import gradio as gr
import pandas as pd
from gradio.components import Component
from gradio_pdf import PDF
from pydantic import BaseModel
from pymupdf.mupdf import ll_pdf_annot_modification_date
from src.config import Config, Navigator
from src.rag.storage import ChromaSlideStore, SearchResultPage, SearchResultPresentation
# Module-level logger named after this module
logger = logging.getLogger(__name__)
def format_page_results(result_page: SearchResultPage) -> str:
    """Render one matched slide as a markdown fragment for the webapp.

    The fragment contains, in order: a page heading (1-based page number),
    the type of the best matching chunk, a table of per-chunk distances and
    the quoted text of every chunk that has a distance.

    Args:
        result_page: Search result for a single slide.

    Returns:
        Markdown text describing the slide.
    """
    slide_chunks = result_page.slide_chunks

    def _show_distance(value):
        # Chunk types without a distance get a label instead of a number
        return "not matched" if value is None else f"{value:.4f}"

    heading = dedent(
        f"""\
        ### Page: {result_page.page_num+1}
        **Best matching chunk:** `{result_page.matched_chunk.chunk_type}`\\
        **Chunk distances:**
        """
    )

    # One row per chunk type, distances already rendered as strings
    raw_table = pd.DataFrame(result_page.chunk_distances, index=["distance"]).T
    shown_table = raw_table.assign(
        distance=lambda frame: frame["distance"].apply(_show_distance)
    )
    distance_table = shown_table.reset_index(names="chunk type").sort_values(
        "distance"
    )

    pieces = [
        heading,
        f"\n{distance_table.to_markdown(index=False)}\n",
        "#### Content:\n",
    ]

    # Quote the text of every chunk that actually matched
    for chunk_type, shown in distance_table.itertuples(index=False, name=None):
        if shown == "not matched":
            continue
        pieces.append(f"`{chunk_type}` d={shown}\n")
        quoted = slide_chunks[chunk_type].page_content.replace("\n", "\n>\n> ")
        # Prefix so that the first line is part of the quote as well
        pieces.append("> " + quoted + "\n\n")

    return "".join(pieces)
def format_presentation_results(
    pres_result: SearchResultPresentation, n_pages: Optional[int] = None
) -> str:
    """Format single presentation results specifically for the webapp.

    The output starts with the presentation name (PDF file stem), a table of
    matched pages with their best scores and the presentation rank score,
    followed by the detailed description of individual slides.

    Args:
        pres_result: Search result for one presentation.
        n_pages: Maximum number of slides to describe in detail. ``None`` or
            ``0`` describes every slide in ``pres_result``. Values larger
            than the number of available slides are clamped instead of
            raising ``IndexError``.

    Returns:
        Markdown text describing the presentation result.
    """
    pdf_path = Path(pres_result.best_slide.pdf_path)

    # Summary table: 1-based page numbers next to their best scores
    summary = pd.DataFrame(
        {
            "page_nums": [s.page_num + 1 for s in pres_result.slides],
            "page_scores": [f"{s.best_score:.4f}" for s in pres_result.slides],
        }
    )

    parts = [
        f"## {pdf_path.stem}\n",
        f"\n{summary.to_markdown(index=False)}\n\n",
        f"**Rank Score:** {pres_result.rank_score:.4f}\n",
    ]

    # Never index past the slides that are actually present
    n_available = len(pres_result)
    n_shown = min(n_pages, n_available) if n_pages else n_available

    for i in range(n_shown):
        parts.append(format_page_results(pres_result[i]))
        parts.append("\n---\n\n")

    return "".join(parts)
class RagInterface:
    """Gradio interface for RAG application.

    Builds a search page with a query box, a search-depth slider and a fixed
    number of result slots. Each slot shows a PDF viewer, a details tab and a
    certainty label.
    """

    # Rank scores below BEST_THRESHOLD are rated best, below OK_THRESHOLD
    # acceptable, everything else poor (see ``rate_response``)
    BEST_THRESHOLD = 0.48
    OK_THRESHOLD = 0.6

    # Slider position -> value returned by ``calculate_params``
    DEPTH_TO_PARAM = {1: 30, 2: 50, 3: 70, 4: 100, 5: 150}
    DEFAULT_PARAM = 70

    def __init__(self, store: "ChromaSlideStore", config: "Optional[Config]" = None):
        """Initialize interface

        Args:
            store: Configured vector store
            config: Optional application config; a default ``Config()`` is
                created when omitted
        """
        self.store = store
        self.config = config or Config()
        self.nav = self.config.navigator

        # Create interface
        self.interface = gr.Blocks()

        # Config
        self.n_outputs = 7  # number of result slots rendered on the page
        self.output_height = 500  # height passed to the PDF and details widgets

    def rate_response(self, score: float) -> str:
        """Translate a rank score into an emoji certainty label.

        Args:
            score: Rank score of a presentation; smaller values are rated
                better.

        Returns:
            "👍" below ``BEST_THRESHOLD``, "👌" below ``OK_THRESHOLD``,
            otherwise "👎".
        """
        if score < self.BEST_THRESHOLD:
            return "👍"
        if score < self.OK_THRESHOLD:
            return "👌"
        return "👎"

    def calculate_params(self, search_depth: int) -> int:
        """Map the search-depth slider position to a numeric parameter.

        Args:
            search_depth: Slider value, expected in 1..5.

        Returns:
            The mapped value, or ``DEFAULT_PARAM`` for unknown depths.
        """
        return self.DEPTH_TO_PARAM.get(search_depth, self.DEFAULT_PARAM)

    def launch(self, **kwargs):
        """Build the Gradio layout, wire the search handler and start the app.

        Args:
            **kwargs: Forwarded to ``gr.Blocks.launch``. ``ssr_mode`` defaults
                to ``False`` and can be overridden by the caller.
        """
        with gr.Blocks() as app:
            gr.Markdown(
                dedent(
                    """\
                    # Presentation Search
                    - Presentations for search are available [at google-drive](https://drive.google.com/drive/folders/1IvUsxxtyyTuHdZff9szhd2OtIATRTcG4?usp=sharing)
                    - List of possible questions for testing is available [at google-sheets](https://docs.google.com/spreadsheets/d/1qWRF_o-RY1x-o-3z08iVb2akh0HS3ZNxVkZi6yoVsI4/edit?usp=sharing)
                    """
                )
            )

            with gr.Row():
                # Input components
                with gr.Row():
                    query = gr.Textbox(
                        label="Search Query",
                        placeholder="Enter your search query...",
                        lines=3,
                        elem_id="query",
                    )
                    with gr.Column():
                        search_depth = gr.Slider(
                            label="Depth of Search",
                            scale=1,
                            minimum=1,
                            maximum=5,
                            value=3,
                            step=1,
                        )
                        search_btn = gr.Button("Search", size="lg", scale=3)

            gr.Examples(
                examples=[
                    "Презентация с картинкой единорога",
                    "В какой презентации был график с экономическими событиями?",
                    "Презентация с мемом про Дональда Трампа",
                    "В какой презентации были красные круги с объемами рекламы?",
                    "В презентации было сравнение размеров Гренландии и Африки",
                    "Презентация про нетворкинг",
                    "Презентация от Яндекс",
                    "Презентация в которой показывали карту грузовых маршрутов для поездов",
                    "В какой презентации были скриншоты игры?",
                    "Презентация с примерами генерации картинок моделью Kandinsky",
                    "Презентации с котами",
                ],
                inputs=[query],
                examples_per_page=15,
            )

            # State holder for results; not connected to any event yet
            results = gr.State([])

            # Results container: one slot per potential result, every slot
            # contributes (pdf, certainty, details) to the handler outputs
            result_components = []
            for i in range(self.n_outputs):
                with gr.Group(visible=True):
                    with gr.Tabs():
                        with gr.Tab(f"Result {i+1}"):
                            with gr.Column():
                                # PDF viewer
                                pdf = PDF(
                                    label="Presentation",
                                    height=self.output_height,
                                    interactive=False,
                                    container=False,
                                    visible=False,
                                )
                        with gr.Tab("Details"):
                            # Results text
                            with gr.Column(variant="panel"):
                                details_text = gr.Markdown(
                                    label="Search Results",
                                    height=self.output_height,
                                    visible=False,
                                )
                    certainty = gr.Markdown()
                result_components.extend([pdf, certainty, details_text])

            def fill_components(inputs):
                """Run the search and produce updates for every result slot.

                ``inputs`` is a dict keyed by component because the click
                event below passes its inputs as a set.
                """
                # NOTE(review): the computed value is discarded here. It is
                # presumably meant to be forwarded to the store search (e.g.
                # as a result limit) - confirm against the store API.
                self.calculate_params(search_depth=inputs[search_depth])
                new_results = self.store.search_query_presentations(
                    query=inputs[query],
                )
                n_found = len(new_results)

                outputs = []
                for slot in range(self.n_outputs):
                    if slot >= n_found:
                        # No result for this slot: hide all of its widgets
                        outputs.extend(
                            [
                                PDF(visible=False),
                                gr.Markdown(visible=False),
                                gr.Markdown(visible=False),
                            ]
                        )
                        continue

                    pres_result = new_results[slot]
                    text = format_presentation_results(pres_result)
                    pdf_path = pres_result.pdf_path
                    page = pres_result[0].page_num

                    # Viewer pages are 1-based, page_num is 0-based
                    slot_pdf = PDF(
                        value=str(pdf_path), starting_page=page + 1, visible=True
                    )
                    certainty_symbol = self.rate_response(pres_result.rank_score)
                    slot_certainty = gr.Markdown(
                        value=f"# Certainty: {certainty_symbol}", visible=True
                    )
                    slot_details = gr.Markdown(value=text, visible=True)
                    outputs.extend([slot_pdf, slot_certainty, slot_details])
                return outputs

            # Wire up the search function
            search_btn.click(
                fn=fill_components,
                inputs={query, search_depth},
                outputs=result_components,
            )

        # Caller-supplied options win; a duplicated ``ssr_mode`` keyword used
        # to raise TypeError
        launch_options = {"ssr_mode": False, **kwargs}
        app.launch(**launch_options)
def run_app(store: ChromaSlideStore, **kwargs):
    """Start the Gradio search application for the given store.

    Args:
        store: Configured ChromaSlideStore instance
        **kwargs: Extra keyword arguments passed through to the launch call
    """
    RagInterface(store).launch(**kwargs)
def main():
    """Run presentation search web application.

    Loads environment variables via ``python-dotenv``, parses the command
    line, creates the slide store and launches the web interface.
    """
    # Load environment
    from dotenv import load_dotenv

    load_dotenv()

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--collection", default=None, help="ChromaDB collection name")
    parser.add_argument("--host", default="0.0.0.0", help="Host to run on")
    parser.add_argument("--port", type=int, default=7860, help="Port to run on")
    parser.add_argument("--share", action="store_true", help="Create public link")
    args = parser.parse_args()

    # An explicit --collection wins; the environment variable is only the
    # fallback (previously the environment silently overrode the CLI value)
    collection = args.collection or os.getenv("CHROMA_COLLECTION_NAME")

    # Initialize store
    # NOTE(review): collection may still be None when neither source is set;
    # how the store handles that is not visible here - confirm.
    store = ChromaSlideStore(
        collection_name=collection
    )  # pyright: ignore[reportArgumentType]

    # Run app
    run_app(store, server_name=args.host, server_port=args.port, share=args.share)
# Script entry point
if __name__ == "__main__":
    main()