Courtney Ford committed on
Commit
dcf70e8
·
1 Parent(s): de20ac0

updates based on feedback

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app_new.py CHANGED
@@ -37,10 +37,10 @@ class EnhancedRAGSystem:
37
  self.demo_mode = True
38
  return
39
 
40
- # Load embedding model
41
  print("Loading embedding model...")
42
  self.embedding_model = HuggingFaceEmbeddings(
43
- model_name="sentence-transformers/all-MiniLM-L6-v2",
44
  model_kwargs={"device": "cpu"},
45
  encode_kwargs={"normalize_embeddings": True},
46
  )
 
37
  self.demo_mode = True
38
  return
39
 
40
+ # Load embedding model - UPDATED TO MATCH NEW MODEL
41
  print("Loading embedding model...")
42
  self.embedding_model = HuggingFaceEmbeddings(
43
+ model_name="sentence-transformers/all-mpnet-base-v2", # CHANGED
44
  model_kwargs={"device": "cpu"},
45
  encode_kwargs={"normalize_embeddings": True},
46
  )
rag_query.py CHANGED
@@ -11,14 +11,28 @@ def format_context_with_citations(results: List[Tuple[Document, float]]) -> str:
11
  for i, (doc, score) in enumerate(results, 1):
12
  citation = doc.metadata.get("citation", "Unknown Source")
13
  entity = doc.metadata.get("entity", "Unknown")
 
 
14
  text = doc.page_content
15
 
16
- context_parts.append(
17
- f"[Source {i}]\n"
18
- f"Citation: {citation}\n"
19
- f"Jurisdiction: {entity}\n"
20
- f"Content: {text}\n"
21
- )
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  return "\n---\n".join(context_parts)
24
 
@@ -93,40 +107,53 @@ def rerank_by_document_priority(
93
  results: List[Tuple[Document, float]], boost_factor: float = 0.3
94
  ) -> List[Tuple[Document, float]]:
95
  """
96
- Rerank results to prioritise passed legislation over white papers.
97
-
98
- Priority order:
99
- 1. Passed Legislation (highest priority)
100
- 2. Draft Legislation (medium priority)
101
- 3. White Papers/Reports (baseline)
102
-
103
- Args:
104
- results: List of (Document, score) tuples from vectorstore
105
- boost_factor: How much to boost priority documents (0.3 = 30% score reduction)
106
-
107
- Returns:
108
- Reranked list of (Document, score) tuples
109
  """
110
  reranked = []
111
 
112
  for doc, score in results:
113
  status_raw = doc.metadata.get("status", "")
114
  status = str(status_raw).lower()
115
- doc_type = str(doc.metadata.get("document_type", ""))
116
-
117
- if "passed" in status or "enacted" in status:
118
- boosted_score = score * (1 - boost_factor * 2)
119
- elif "draft" in status or "bill" in status:
120
- boosted_score = score * (1 - boost_factor)
121
- elif doc_type in ["Article_style", "US_Congress", "Special_cases"]:
122
- boosted_score = score * (1 - boost_factor * 0.5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  else:
124
  boosted_score = score
125
 
126
  reranked.append((doc, boosted_score))
127
 
 
128
  reranked.sort(key=lambda x: x[1])
129
 
 
130
  return [
131
  (doc, original_score)
132
  for (doc, _), (_, original_score) in zip(reranked, results)
@@ -220,6 +247,33 @@ def extract_document_references(
220
  return list(set(matching_files)), suggested_entity
221
 
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  def ask_question_with_llm(
224
  vectorstore,
225
  question: str,
@@ -244,6 +298,11 @@ def ask_question_with_llm(
244
  Returns:
245
  Dictionary with answer, sources, and metadata
246
  """
 
 
 
 
 
247
  # Check if question references specific documents
248
  referenced_docs = []
249
  detected_entity = None
@@ -341,6 +400,9 @@ def ask_question_with_llm(
341
 
342
  results = boosted_results + other_results
343
 
 
 
 
344
  results = results[:k]
345
 
346
  if not results:
 
11
  for i, (doc, score) in enumerate(results, 1):
12
  citation = doc.metadata.get("citation", "Unknown Source")
13
  entity = doc.metadata.get("entity", "Unknown")
14
+ language = doc.metadata.get("language", "")
15
+ status = doc.metadata.get("status", "")
16
  text = doc.page_content
17
 
18
+ # Build the source block
19
+ source_block = [
20
+ f"[Source {i}]",
21
+ f"Citation: {citation}",
22
+ f"Jurisdiction: {entity}",
23
+ ]
24
+
25
+ if status and status.lower() not in ["published", ""]:
26
+ source_block.append(f"Status: {status}")
27
+
28
+ if language and language.lower() not in ["english", ""]:
29
+ source_block.append(
30
+ f"Language: {language} translation - interpret with caution"
31
+ )
32
+
33
+ source_block.append(f"Content: {text}")
34
+
35
+ context_parts.append("\n".join(source_block))
36
 
37
  return "\n---\n".join(context_parts)
38
 
 
107
  results: List[Tuple[Document, float]], boost_factor: float = 0.3
108
  ) -> List[Tuple[Document, float]]:
109
  """
110
+ Rerank results to prioritize:
111
+ 1. Primary legislation (highest priority)
112
+ 2. Draft legislation (medium priority)
113
+ 3. Articles over preambles
114
+ 4. White Papers/Reports (lowest priority)
 
 
 
 
 
 
 
 
115
  """
116
  reranked = []
117
 
118
  for doc, score in results:
119
  status_raw = doc.metadata.get("status", "")
120
  status = str(status_raw).lower()
121
+ doc_type = doc.metadata.get("document_type", "")
122
+ filename = doc.metadata.get("filename", "")
123
+
124
+ # Highest priority: Passed/enacted legislation in Article/Section format
125
+ if ("passed" in status or "enacted" in status) and doc_type in [
126
+ "Article_style",
127
+ "US_Congress",
128
+ "Special_cases",
129
+ ]:
130
+ boosted_score = score * (1 - boost_factor * 3) # Strong boost
131
+
132
+ # Deprioritize preambles
133
+ elif "preamble" in filename.lower():
134
+ boosted_score = score * (1 + boost_factor * 2) # Penalty
135
+
136
+ # Medium priority: Draft legislation or other structured docs
137
+ elif "draft" in status or doc_type in [
138
+ "Article_style",
139
+ "US_Congress",
140
+ "Special_cases",
141
+ ]:
142
+ boosted_score = score * (1 - boost_factor * 1.5)
143
+
144
+ # Low priority: White papers and reports (Paragraph_style)
145
+ elif doc_type == "Paragraph_style":
146
+ boosted_score = score * (1 + boost_factor) # Slight penalty
147
+
148
  else:
149
  boosted_score = score
150
 
151
  reranked.append((doc, boosted_score))
152
 
153
+ # Sort by boosted score (lower is better in FAISS)
154
  reranked.sort(key=lambda x: x[1])
155
 
156
+ # Return with ORIGINAL scores for transparency
157
  return [
158
  (doc, original_score)
159
  for (doc, _), (_, original_score) in zip(reranked, results)
 
247
  return list(set(matching_files)), suggested_entity
248
 
249
 
250
+ def is_comparison_question(question: str) -> bool:
251
+ """Detect if question is comparing multiple jurisdictions"""
252
+ question_lower = question.lower()
253
+
254
+ comparison_patterns = [
255
+ "differ from",
256
+ "compared to",
257
+ "versus",
258
+ "vs",
259
+ "vs.",
260
+ "difference between",
261
+ "differences between",
262
+ "compare",
263
+ "comparison",
264
+ "contrast",
265
+ "how does",
266
+ "what does",
267
+ "unlike",
268
+ "similar to",
269
+ "different from",
270
+ "in contrast to",
271
+ "as opposed to",
272
+ ]
273
+
274
+ return any(pattern in question_lower for pattern in comparison_patterns)
275
+
276
+
277
  def ask_question_with_llm(
278
  vectorstore,
279
  question: str,
 
298
  Returns:
299
  Dictionary with answer, sources, and metadata
300
  """
301
+ # If it's a comparison question, disable auto entity detection
302
+ if is_comparison_question(question):
303
+ auto_detect_entity = False
304
+ print("Comparison question detected - retrieving from all jurisdictions")
305
+
306
  # Check if question references specific documents
307
  referenced_docs = []
308
  detected_entity = None
 
400
 
401
  results = boosted_results + other_results
402
 
403
+ # RERANK BY DOCUMENT PRIORITY - prioritize primary legislation
404
+ results = rerank_by_document_priority(results, boost_factor=0.3)
405
+
406
  results = results[:k]
407
 
408
  if not results:
vectorstore/.DS_Store ADDED
Binary file (6.15 kB). View file
 
vectorstore/index.faiss CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efb63120469d0a2b5af3345fdf5decfb9ab55ad7fb32b8b90fcf0df3fab8c652
3
- size 59520045
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb883f59d3927a716c42023d82c9b39ba5edde321942c09a8909e06c3b2ea52d
3
+ size 119040045
vectorstore/index.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffd9eb630f2f73dda4e0581806087bc483cb471c37101ebb8ede6f3c0c3506f2
3
- size 43207761
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6002066d961d87767b0dbad5ebbc2c5bfcd59f0b088f1250874bb789a7de9c45
3
+ size 43207710