Tuminha committed on
Commit
150db75
·
verified ·
1 Parent(s): 5a9b680

Upload src/app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/app.py +259 -0
src/app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio demo wiring: input question -> retrieve -> compose_answer -> show quotes.
3
+ """
4
+ from pathlib import Path
5
+ import yaml
6
+ import numpy as np
7
+ import faiss
8
+ import gradio as gr
9
+ import re
10
+ from sentence_transformers import SentenceTransformer
11
+ from src.embed_index import load_index
12
+ from src.retrieve import retrieve
13
+ from src.compose import compose_answer
14
+
15
+
16
def load_config(config_path="../configs/app.yaml"):
    """Load configuration from a YAML file.

    Args:
        config_path: Path to the YAML file. It is tried as given first
            (relative paths resolve against the current working directory).
            If no file exists there and the path is relative, it is also
            tried relative to the directory that contains this module.

    Returns:
        The parsed YAML content, or an empty dict when the file is empty.

    Raises:
        FileNotFoundError: If the file cannot be found at either location.
    """
    path = Path(config_path)
    if not path.is_absolute() and not path.exists():
        # The default is written relative to this module's directory, so fall
        # back to that when the process was started from somewhere else.
        module_relative = Path(__file__).resolve().parent / path
        if module_relative.exists():
            path = module_relative

    with open(path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; callers use .get(),
        # so hand back an empty mapping instead.
        return yaml.safe_load(f) or {}
20
+
21
+
22
def embed_query(query: str, model: SentenceTransformer) -> np.ndarray:
    """Encode a single query string and return its L2-normalized vector.

    Args:
        query: Text to embed.
        model: Model whose ``encode`` method produces the embedding.

    Returns:
        The first (and only) row of the float32 embedding batch.
    """
    encode_options = {"normalize_embeddings": True, "show_progress_bar": False}
    raw_batch = model.encode([query], **encode_options)

    # FAISS normalizes in place, so work on a float32 copy of the batch.
    batch = np.array(raw_batch, dtype=np.float32)
    faiss.normalize_L2(batch)  # Normalize for IndexFlatIP

    # retrieve() expects a 1D array, so drop the batch dimension.
    return batch[0]
28
+
29
+
30
def is_toc_or_header_chunk(result: dict) -> bool:
    """
    Detect if a chunk is a TOC, header, or low-content chunk.

    Args:
        result: Retrieved chunk dict. The optional keys 'text', 'chunk_id'
            and 'meta' are read; from 'meta' only 'para_idx_start' is used.
            Missing or None values are treated as empty.

    Returns:
        True if the chunk should be filtered out, False otherwise.
    """
    # Treat missing/None fields as empty instead of raising on them.
    text = result.get('text') or ''
    raw_chunk_id = result.get('chunk_id')
    chunk_id = '' if raw_chunk_id is None else str(raw_chunk_id)
    meta = result.get('meta') or {}

    # The first chunk is dropped only when it actually looks like a TOC
    # (a "Contents" heading plus many CHAPTER entries).
    is_first_chunk = (
        chunk_id.endswith('_chunk_0') or meta.get('para_idx_start', -1) == 0
    )
    if is_first_chunk and 'Contents' in text and text.count('CHAPTER') > 5:
        return True

    # Filter very short chunks.
    if len(text) < 150:
        return True

    # Filter chunks made of more than 15% newlines (indicates headers/TOC).
    # len(text) >= 150 here, so the division is safe.
    if text.count('\n') / len(text) > 0.15:
        return True

    # Filter chunks with more than 3 lines mentioning "chapter" in any case.
    # This test already covers "CHAPTER <roman numeral>" title lines.
    lines = text.split('\n')
    chapter_line_count = sum(1 for line in lines if 'CHAPTER' in line.upper())
    if chapter_line_count > 3:
        return True

    # Filter chunks that start with a title/author/contents pattern.
    # first_100 is lower-cased, so every needle searched in it must be too.
    first_100 = text[:100].lower()
    has_toc_heading = 'contents' in first_100 and 'chapter' in first_100
    has_title_page = (
        text.startswith('The Picture of') and 'by oscar wilde' in first_100
    )
    if has_toc_heading or has_title_page:
        # Mostly-TOC text shows up as many short lines near the top.
        short_line_count = sum(
            1 for line in lines[:30] if len(line.strip()) < 50
        )
        if short_line_count > 10:
            return True

    return False
71
+
72
+
73
def filter_results(results: list, filter_toc: bool = True) -> list:
    """
    Drop TOC/header chunks from a list of retrieval results.

    Args:
        results: List of retrieved chunk dicts
        filter_toc: When False, the input list is returned untouched

    Returns:
        The chunks that survive filtering. If filtering would remove every
        chunk of a non-empty input, the original list is returned instead.
    """
    if filter_toc:
        kept = [item for item in results if not is_toc_or_header_chunk(item)]
        # Showing unfiltered chunks is better than showing nothing at all.
        return kept if kept or not results else results

    return results
94
+
95
+
96
def format_composed_answer(composed: dict) -> str:
    """
    Render a composed answer as markdown.

    The output always starts with an "Answer" section built from
    composed['answer']. When composed['references'] is present and
    non-empty, an "Evidence" section follows with one reference per
    paragraph.
    """
    sections = [f"## Answer\n\n{composed['answer']}\n\n"]

    references = composed.get('references')
    if references:
        sections.append("## Evidence\n\n")
        sections.extend(f"{reference}\n\n" for reference in references)

    return "".join(sections)
108
+
109
+
110
def predict(query: str, index, metadata_df, model: SentenceTransformer, config,
            chunks_lookup: dict = None, filter_toc: bool = True):
    """
    Main prediction function: retrieve chunks, compose answer, and format for display.

    Args:
        query: User's question
        index: FAISS index
        metadata_df: Metadata DataFrame
        model: SentenceTransformer model
        config: Configuration dict; 'top_k' (default 5) and
            'max_answer_tokens' (default 300) are read from it
        chunks_lookup: Dict mapping chunk_id to chunk data
        filter_toc: Whether to filter out TOC/header chunks

    Returns:
        Formatted markdown string with answer and citations. For a blank
        query, an empty retrieval, or any error, a human-readable message
        is returned instead of raising.
    """
    if not query or not query.strip():
        return "Please enter a question."

    k = config.get('top_k', 5)
    # Rough estimate: one quote per ~100 answer tokens. Floor at 1 so that a
    # configured budget below 100 tokens does not request zero quotes.
    max_quotes = max(1, config.get('max_answer_tokens', 300) // 100)

    # retrieve() takes an embedding callable, so bind the model here.
    def embed_fn(q: str) -> np.ndarray:
        return embed_query(q, model)

    try:
        # Retrieve top-k chunks using the retrieve() function
        retrieved = retrieve(
            query=query,
            index=index,
            embed_fn=embed_fn,
            metadata_df=metadata_df,
            chunks_lookup=chunks_lookup,
            k=k,
        )

        if not retrieved:
            return "No results found. Try a different query."

        # Filter out TOC/header chunks if enabled
        if filter_toc:
            retrieved = filter_results(retrieved, filter_toc=True)
            # Defensive guard in case filtering leaves nothing to show.
            if not retrieved:
                return "No relevant content found after filtering. Try a different query."

        # Compose answer using retrieved chunks
        try:
            composed = compose_answer(query, retrieved, max_quotes=max_quotes)
            return format_composed_answer(composed)
        except Exception as compose_error:
            # Fallback: show the top raw retrieval result if composition fails
            parts = [
                f"Error composing answer: {compose_error}\n\n",
                f"Retrieved {len(retrieved)} chunks. Showing top result:\n\n",
            ]
            if retrieved:
                top_result = retrieved[0]
                parts.append(f"**Chunk:** {top_result.get('chunk_id', 'unknown')}\n")
                parts.append(f"**Score:** {top_result.get('score', 0):.4f}\n")
                parts.append(f"**Text:** {top_result.get('text', '')[:300]}...\n")
            return "".join(parts)

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"Error processing query: {str(e)}\n\nPlease try rephrasing your question."
175
+
176
+
177
def launch_app(config_path="../configs/app.yaml", index_dir="../data/index"):
    """
    Build the Gradio Interface for the RAG system.

    Loads the configuration, the FAISS index with its metadata, the embedding
    model, and (best effort) the chunk texts, then wires them into a
    question-answering interface.

    Args:
        config_path: Path to config YAML file
        index_dir: Directory containing the FAISS index and metadata

    Returns:
        Gradio Interface object (not yet launched)
    """
    import json

    # Load configuration
    config = load_config(config_path)

    print("📚 Loading FAISS index and metadata...")
    index, metadata_df = load_index(index_dir)

    print(f"🤖 Loading embedding model: {config['embedding_model']}...")
    model = SentenceTransformer(config['embedding_model'])

    # Load chunks data for retrieve() function (needs text for compose_answer).
    # This is best effort: any failure only prints a warning.
    chunks_lookup = None
    try:
        book_name = config['book']
        relative_chunks = Path("data/interim/chunks") / f"{book_name}_chunks.json"
        # Try the working directory first, then the parent of this module's
        # directory, so the file is found regardless of where the app starts.
        candidates = (
            relative_chunks,
            Path(__file__).resolve().parent.parent / relative_chunks,
        )
        chunks_file = next((path for path in candidates if path.exists()), None)
        if chunks_file is not None:
            with open(chunks_file, 'r', encoding='utf-8') as f:
                chunks_list = json.load(f)
            chunks_lookup = {chunk['id']: chunk for chunk in chunks_list}
            print(f"✅ Loaded {len(chunks_lookup)} chunks for retrieval and composition")
        else:
            print(f"⚠️ Chunks file not found: {relative_chunks}")
            print(" Retrieval will work but compose_answer may not have chunk text")
    except Exception as e:
        print(f"⚠️ Could not load chunks data: {e}")
        print(" Retrieval will work but compose_answer may not have chunk text")

    # Create prediction function with loaded resources
    def predict_wrapper(query: str):
        return predict(query, index, metadata_df, model, config, chunks_lookup, filter_toc=True)

    description = (
        "\n"
        f"Ask questions about **{config['book'].title()}**!\n"
        "\n"
        "This system uses semantic search to find relevant passages and "
        "compose answers with verbatim citations.\n"
        "\n"
        "**Tips for better results:**\n"
        "- Ask specific, concrete questions\n"
        "- Use descriptive queries about characters, objects, or events\n"
        "- The system automatically filters out table-of-contents and headers\n"
    )

    # Example questions depend on which book is configured.
    if config['book'] == 'dorian':
        examples = [
            "What does the portrait of Dorian Gray look like?",
            "How does Basil describe meeting Dorian for the first time?",
            "What does Lord Henry say about beauty and intellect?",
            "Why doesn't Basil want to exhibit the portrait?",
        ]
    else:
        examples = [
            "How does Homer portray Achilles' anger in Book 1?",
            "What happens in the first book of the Iliad?",
            "Describe the shield of Achilles.",
            "What is the conflict between Agamemnon and Achilles?",
        ]

    # Create Gradio interface
    interface = gr.Interface(
        fn=predict_wrapper,
        inputs=gr.Textbox(
            label="Question",
            placeholder="Ask a question about the book...",
            lines=2,
        ),
        outputs=gr.Markdown(label="Answer & Evidence"),
        title="📚 Classics RAG Q&A",
        description=description,
        examples=examples,
        theme=gr.themes.Soft(),
    )

    print("✅ Gradio interface ready!")
    return interface
255
+
256
+
257
if __name__ == "__main__":
    # Script entry point: build the interface with default paths and serve it
    # on all network interfaces, port 7860, without a public share link.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    interface = launch_app()
    interface.launch(**launch_options)