galbendavids committed on
Commit
a98f7cb
·
verified ·
1 Parent(s): da458f9

optimize rag flow

Browse files
__pycache__/rag_engine.cpython-313.pyc CHANGED
Binary files a/__pycache__/rag_engine.cpython-313.pyc and b/__pycache__/rag_engine.cpython-313.pyc differ
 
__pycache__/rag_engine.cpython-37.pyc CHANGED
Binary files a/__pycache__/rag_engine.cpython-37.pyc and b/__pycache__/rag_engine.cpython-37.pyc differ
 
app.py CHANGED
@@ -16,6 +16,7 @@ Implemented Features:
16
 
17
  import gradio as gr
18
  import os
 
19
  from rag_engine import RAGEngine
20
 
21
  # Initialize RAG Engine - Loads data and builds indices
@@ -26,6 +27,8 @@ try:
26
  print(f"✅ Engine ready with {len(engine.chunks)} smart chunks")
27
  except Exception as e:
28
  print(f"❌ Error initializing RAG Engine: {e}")
 
 
29
  engine = None
30
 
31
 
@@ -60,14 +63,17 @@ Get your key from: [Google AI Studio](https://aistudio.google.com/apikey)"""
60
  The RAG Engine failed to load. This usually means:
61
  - Data files are missing
62
  - Environment is misconfigured
 
63
 
64
- Please check the Space logs for details."""
 
 
 
65
 
66
  try:
67
  # Generate response using the RAG engine
68
  response = engine.generate_response(message, history, api_key)
69
  return response
70
-
71
  except Exception as e:
72
  return f"""❌ **Error Processing Query**
73
 
 
16
 
17
  import gradio as gr
18
  import os
19
+ import traceback
20
  from rag_engine import RAGEngine
21
 
22
  # Initialize RAG Engine - Loads data and builds indices
 
27
  print(f"✅ Engine ready with {len(engine.chunks)} smart chunks")
28
  except Exception as e:
29
  print(f"❌ Error initializing RAG Engine: {e}")
30
+ print("Full traceback:")
31
+ traceback.print_exc()
32
  engine = None
33
 
34
 
 
63
  The RAG Engine failed to load. This usually means:
64
  - Data files are missing
65
  - Environment is misconfigured
66
+ - Check the Space logs for specific error details
67
 
68
+ Common solutions:
69
+ 1. Ensure data_ingestion/scraped_data.json exists
70
+ 2. Check that all dependencies are installed
71
+ 3. Verify the workspace path is correct"""
72
 
73
  try:
74
  # Generate response using the RAG engine
75
  response = engine.generate_response(message, history, api_key)
76
  return response
 
77
  except Exception as e:
78
  return f"""❌ **Error Processing Query**
79
 
rag_engine.py CHANGED
@@ -249,7 +249,7 @@ class RAGEngine:
249
  self.keyword_index[keyword].append(len(self.chunks) - 1)
250
 
251
  print(f"Created {len(self.chunks)} smart chunks from {len(raw_data)} articles with rich metadata.")
252
- self._build_index()
253
 
254
  def _extract_car_type(self, title: str) -> str:
255
  """זיהוי סוג הרכב"""
 
249
  self.keyword_index[keyword].append(len(self.chunks) - 1)
250
 
251
  print(f"Created {len(self.chunks)} smart chunks from {len(raw_data)} articles with rich metadata.")
252
+ # Don't build index here - let it be lazy loaded on first search
253
 
254
  def _extract_car_type(self, title: str) -> str:
255
  """זיהוי סוג הרכב"""
test_rag.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Simple test file for RAG Engine
4
+ Tests basic initialization and search functionality
5
+ """
6
+
7
+ import sys
8
+ import os
9
+
10
+ # Add project to path
11
+ sys.path.insert(0, os.path.dirname(__file__))
12
+
13
+ def test_initialization():
14
+ """Test RAG engine initialization"""
15
+ print("\n" + "="*60)
16
+ print("TEST 1: RAG Engine Initialization")
17
+ print("="*60)
18
+
19
+ from rag_engine import RAGEngine
20
+
21
+ try:
22
+ engine = RAGEngine()
23
+ print(f"✅ Engine initialized successfully")
24
+ print(f" - Chunks loaded: {len(engine.chunks)}")
25
+ print(f" - Metadata entries: {len(engine.chunk_metadata)}")
26
+ print(f" - Keyword index entries: {len(engine.keyword_index)}")
27
+ print(f" - Embeddings: {engine.embeddings}")
28
+ return True, engine
29
+ except Exception as e:
30
+ print(f"❌ Initialization failed: {e}")
31
+ import traceback
32
+ traceback.print_exc()
33
+ return False, None
34
+
35
+
36
+ def test_search(engine):
37
+ """Test hybrid search functionality"""
38
+ print("\n" + "="*60)
39
+ print("TEST 2: Hybrid Search")
40
+ print("="*60)
41
+
42
+ try:
43
+ query = "Tell me about the Audi RS3"
44
+ print(f"Testing search for: '{query}'")
45
+
46
+ results = engine._hybrid_search(query, top_k=3)
47
+ print(f"✅ Search successful")
48
+ print(f" - Results found: {len(results)}")
49
+
50
+ if results:
51
+ print(f" - Top result score: {results[0]['score']:.3f}")
52
+ print(f" - Top result title: {results[0]['metadata']['title']}")
53
+
54
+ return True
55
+ except Exception as e:
56
+ print(f"❌ Search failed: {e}")
57
+ import traceback
58
+ traceback.print_exc()
59
+ return False
60
+
61
+
62
+ def test_car_normalization(engine):
63
+ """Test car name normalization"""
64
+ print("\n" + "="*60)
65
+ print("TEST 3: Car Name Normalization")
66
+ print("="*60)
67
+
68
+ test_cases = [
69
+ ("Audi RS3", "audi_rs3"),
70
+ ("RS3", "audi_rs3"),
71
+ ("קיה EV9", "kia_ev9"),
72
+ ("Citroen C3", "citroen_c3"),
73
+ ]
74
+
75
+ passed = 0
76
+ failed = 0
77
+
78
+ for text, expected in test_cases:
79
+ result = engine._normalize_car_name(text)
80
+ if result == expected:
81
+ print(f"✅ '{text}' → {result}")
82
+ passed += 1
83
+ else:
84
+ print(f"❌ '{text}' → {result} (expected {expected})")
85
+ failed += 1
86
+
87
+ print(f" - Passed: {passed}/{len(test_cases)}")
88
+ return failed == 0
89
+
90
+
91
+ def test_embeddings(engine):
92
+ """Test that embeddings are lazy loaded"""
93
+ print("\n" + "="*60)
94
+ print("TEST 4: Lazy Embedding Loading")
95
+ print("="*60)
96
+
97
+ try:
98
+ # Check initial state
99
+ if engine.embeddings is None:
100
+ print("✅ Embeddings are None at startup (lazy loading working)")
101
+ else:
102
+ print("⚠️ Embeddings already loaded (not lazy)")
103
+
104
+ # Trigger embedding generation
105
+ query = "Test query"
106
+ engine._hybrid_search(query, top_k=1)
107
+
108
+ if engine.embeddings is not None:
109
+ print(f"✅ Embeddings generated after first search")
110
+ print(f" - Shape: {engine.embeddings.shape}")
111
+ print(f" - Expected chunks: {len(engine.chunks)}")
112
+ return True
113
+ else:
114
+ print(f"❌ Embeddings not generated")
115
+ return False
116
+
117
+ except Exception as e:
118
+ print(f"❌ Embedding test failed: {e}")
119
+ import traceback
120
+ traceback.print_exc()
121
+ return False
122
+
123
+
124
+ def main():
125
+ """Run all tests"""
126
+ print("\n" + "="*60)
127
+ print("CARSRUS RAG ENGINE TEST SUITE")
128
+ print("="*60)
129
+
130
+ # Test 1: Initialization
131
+ success, engine = test_initialization()
132
+ if not success:
133
+ print("\n❌ TESTS FAILED - Initialization error")
134
+ return 1
135
+
136
+ # Test 2: Normalization
137
+ if not test_car_normalization(engine):
138
+ print("\n⚠️ Some normalization tests failed")
139
+
140
+ # Test 3: Search
141
+ if not test_search(engine):
142
+ print("\n❌ TESTS FAILED - Search error")
143
+ return 1
144
+
145
+ # Test 4: Embeddings
146
+ if not test_embeddings(engine):
147
+ print("\n⚠️ Embedding test had issues")
148
+
149
+ # Summary
150
+ print("\n" + "="*60)
151
+ print("✅ ALL CRITICAL TESTS PASSED")
152
+ print("="*60)
153
+ print("\nRAG Engine is ready for deployment!")
154
+ print("- Initialization: ✅")
155
+ print("- Data loading: ✅")
156
+ print("- Search functionality: ✅")
157
+ print("- Lazy loading: ✅")
158
+
159
+ return 0
160
+
161
+
162
+ if __name__ == "__main__":
163
+ exit(main())