Spaces:
Sleeping
Sleeping
| """ | |
| Gradio demo wiring: input question -> retrieve -> compose_answer -> show quotes. | |
| """ | |
| from pathlib import Path | |
| import yaml | |
| import numpy as np | |
| import faiss | |
| import gradio as gr | |
| import re | |
| from sentence_transformers import SentenceTransformer | |
| from src.embed_index import load_index | |
| from src.retrieve import retrieve | |
| from src.compose import compose_answer | |
def load_config(config_path="../configs/app.yaml"):
    """Read the application's YAML config file and return it as a dict."""
    raw = Path(config_path).read_text(encoding='utf-8')
    return yaml.safe_load(raw)
def embed_query(query: str, model: "SentenceTransformer") -> np.ndarray:
    """Embed a single query string for similarity search.

    Args:
        query: Raw query text.
        model: SentenceTransformer (or any object exposing a compatible
            ``encode``) used to produce the embedding.

    Returns:
        A 1-D float32 numpy array with unit L2 norm, as required by an
        IndexFlatIP index (inner product == cosine similarity) and by
        ``retrieve``.
    """
    embedding = model.encode([query], normalize_embeddings=True, show_progress_bar=False)
    embedding = np.asarray(embedding, dtype=np.float32)
    # encode() already normalizes; re-normalize defensively in pure numpy
    # instead of faiss.normalize_L2 -- same result, without a faiss
    # dependency for what is a plain vector operation.
    norms = np.linalg.norm(embedding, axis=1, keepdims=True)
    embedding = embedding / np.maximum(norms, 1e-12)
    return embedding[0]  # retrieve() expects a 1-D vector
def is_toc_or_header_chunk(result: dict) -> bool:
    """Heuristically detect TOC, header, or low-content chunks.

    Args:
        result: Retrieved chunk dict with 'text', 'chunk_id', and 'meta' keys.

    Returns:
        True if the chunk looks like front matter (table of contents,
        title page, chapter headings) and should be filtered out.
    """
    text = result.get('text', '')
    chunk_id = result.get('chunk_id', '')
    meta = result.get('meta', {})

    # Chunk 0 is usually TOC/preface, but only drop it when it really
    # looks like a contents page (many CHAPTER entries).
    if chunk_id.endswith('_chunk_0') or meta.get('para_idx_start', -1) == 0:
        if 'Contents' in text and text.count('CHAPTER') > 5:
            return True  # It's a TOC

    # Very short chunks carry too little content to be worth citing.
    if len(text) < 150:
        return True

    # High newline density indicates a list of headings rather than prose.
    newline_ratio = text.count('\n') / len(text) if len(text) > 0 else 0
    if newline_ratio > 0.15:  # More than 15% newlines
        return True

    # Mostly chapter-title lines. (The original also re.match'ed
    # r'^CHAPTER\s+[IVX]+' case-insensitively, but every such line already
    # satisfies the substring test, so the regex arm was redundant.)
    lines = text.split('\n')
    chapter_lines = [line for line in lines if 'CHAPTER' in line.upper()]
    if len(chapter_lines) > 3:  # More than 3 chapter title lines
        return True

    # Title/author/contents front-matter patterns.
    first_100 = text[:100].lower()
    # BUG FIX: first_100 is lowercased, so the original mixed-case check
    # 'by Oscar Wilde' could never match; compare against lowercase.
    if ('contents' in first_100 and 'chapter' in first_100) or \
            (text.startswith('The Picture of') and 'by oscar wilde' in first_100):
        # Many short lines near the top => mostly a TOC layout.
        short_lines = [line for line in lines[:30] if len(line.strip()) < 50]
        if len(short_lines) > 10:  # More than 10 short lines in first 30
            return True

    return False
def filter_results(results: list, filter_toc: bool = True) -> list:
    """Drop TOC/header-looking chunks from retrieval results.

    Args:
        results: List of retrieved chunk dicts.
        filter_toc: When False, results are returned unchanged.

    Returns:
        The filtered list. If filtering would discard everything, the
        original list is returned instead (some answer beats no answer).
    """
    if not filter_toc:
        return results
    kept = [item for item in results if not is_toc_or_header_chunk(item)]
    if results and not kept:
        # Every hit looked like front matter; fall back to the raw results.
        return results
    return kept
def format_composed_answer(composed: dict) -> str:
    """Render a composed answer dict as markdown: answer, then evidence quotes."""
    parts = [f"## Answer\n\n{composed['answer']}\n\n"]
    refs = composed.get('references')
    if refs:
        parts.append("## Evidence\n\n")
        parts.extend(f"{ref}\n\n" for ref in refs)
    return "".join(parts)
def predict(query: str, index, metadata_df, model: "SentenceTransformer", config,
            chunks_lookup: dict = None, filter_toc: bool = True):
    """Answer a question: retrieve chunks, compose an answer, format as markdown.

    Args:
        query: User's question.
        index: FAISS index over the chunk embeddings.
        metadata_df: Metadata DataFrame aligned with the index.
        model: SentenceTransformer model used to embed the query.
        config: Configuration dict (reads 'top_k', 'max_answer_tokens').
        chunks_lookup: Dict mapping chunk_id to chunk data, or None.
        filter_toc: Whether to filter out TOC/header chunks.

    Returns:
        Formatted markdown string with answer and citations, or a
        human-readable error/empty-result message.
    """
    if not query or not query.strip():
        return "Please enter a question."

    k = config.get('top_k', 5)
    # Rough quote budget (~100 tokens per quote). FIX: floor at 1 so a
    # small max_answer_tokens (<100) can no longer request zero quotes.
    max_quotes = max(1, config.get('max_answer_tokens', 300) // 100)

    def embed_fn(q: str) -> np.ndarray:
        # Adapter so retrieve() can embed queries without knowing the model.
        return embed_query(q, model)

    try:
        retrieved = retrieve(
            query=query,
            index=index,
            embed_fn=embed_fn,
            metadata_df=metadata_df,
            chunks_lookup=chunks_lookup,
            k=k
        )
        if not retrieved:
            return "No results found. Try a different query."

        # filter_results() is already a no-op when filter_toc is False,
        # so no outer `if filter_toc:` guard is needed.
        retrieved = filter_results(retrieved, filter_toc=filter_toc)
        if not retrieved:
            return "No relevant content found after filtering. Try a different query."

        try:
            composed = compose_answer(query, retrieved, max_quotes=max_quotes)
            return format_composed_answer(composed)
        except Exception as compose_error:
            # Fallback: surface the raw top hit so the user still sees evidence.
            error_msg = f"Error composing answer: {compose_error}\n\n"
            error_msg += f"Retrieved {len(retrieved)} chunks. Showing top result:\n\n"
            if retrieved:
                top_result = retrieved[0]
                error_msg += f"**Chunk:** {top_result.get('chunk_id', 'unknown')}\n"
                error_msg += f"**Score:** {top_result.get('score', 0):.4f}\n"
                error_msg += f"**Text:** {top_result.get('text', '')[:300]}...\n"
            return error_msg
    except Exception as e:
        # Broad catch is deliberate: output feeds a UI textbox, never crash it.
        return f"Error processing query: {str(e)}\n\nPlease try rephrasing your question."
def launch_app(config_path="../configs/app.yaml", index_dir="../data/index"):
    """
    Start a Gradio Interface for the RAG system.

    Loads the YAML config, the FAISS index + metadata, and the embedding
    model, then (best-effort) the chunk texts, and wires everything into a
    single-textbox Gradio interface backed by predict().

    Args:
        config_path: Path to config YAML file
        index_dir: Directory containing the FAISS index and metadata

    Returns:
        Gradio Interface object (caller is responsible for calling .launch())
    """
    # Load configuration
    config = load_config(config_path)
    print("π Loading FAISS index and metadata...")
    index, metadata_df = load_index(index_dir)
    print(f"π€ Loading embedding model: {config['embedding_model']}...")
    model = SentenceTransformer(config['embedding_model'])
    # Load chunks data for retrieve() function (needs text for compose_answer).
    # Best-effort: the app still serves without it, so failures only warn.
    chunks_lookup = None
    try:
        import json
        book_name = config['book']
        # NOTE(review): this path is relative to the CWD, while config_path /
        # index_dir default to "../" -- confirm which directory the app runs from.
        chunks_file = Path(f"data/interim/chunks/{book_name}_chunks.json")
        if chunks_file.exists():
            with open(chunks_file, 'r', encoding='utf-8') as f:
                chunks_list = json.load(f)
            # Index chunks by id for O(1) lookup during retrieval.
            chunks_lookup = {chunk['id']: chunk for chunk in chunks_list}
            print(f"β Loaded {len(chunks_lookup)} chunks for retrieval and composition")
        else:
            print(f"β οΈ Chunks file not found: {chunks_file}")
            print(" Retrieval will work but compose_answer may not have chunk text")
    except Exception as e:
        print(f"β οΈ Could not load chunks data: {e}")
        print(" Retrieval will work but compose_answer may not have chunk text")
    # Create prediction function with loaded resources.
    # Closure over the heavyweight objects so Gradio only passes the query.
    def predict_wrapper(query: str):
        return predict(query, index, metadata_df, model, config, chunks_lookup, filter_toc=True)
    # Create Gradio interface
    interface = gr.Interface(
        fn=predict_wrapper,
        inputs=gr.Textbox(
            label="Question",
            placeholder="Ask a question about the book...",
            lines=2
        ),
        outputs=gr.Markdown(label="Answer & Evidence"),
        title="π Classics RAG Q&A",
        description=f"""
        Ask questions about **{config['book'].title()}**!
        This system uses semantic search to find relevant passages and compose answers with verbatim citations.
        **Tips for better results:**
        - Ask specific, concrete questions
        - Use descriptive queries about characters, objects, or events
        - The system automatically filters out table-of-contents and headers
        """,
        # Example prompts per corpus: 'dorian' gets Wilde questions,
        # anything else falls back to the Iliad set.
        examples=[
            "What does the portrait of Dorian Gray look like?",
            "How does Basil describe meeting Dorian for the first time?",
            "What does Lord Henry say about beauty and intellect?",
            "Why doesn't Basil want to exhibit the portrait?",
        ] if config['book'] == 'dorian' else [
            "How does Homer portray Achilles' anger in Book 1?",
            "What happens in the first book of the Iliad?",
            "Describe the shield of Achilles.",
            "What is the conflict between Agamemnon and Achilles?",
        ],
        theme=gr.themes.Soft(),
    )
    print("β Gradio interface ready!")
    return interface
if __name__ == "__main__":
    # Build the interface and serve it on all interfaces at port 7860.
    app = launch_app()
    app.launch(share=False, server_name="0.0.0.0", server_port=7860)