mtyrrell committed on
Commit
d60bab3
·
1 Parent(s): d384965
Files changed (3) hide show
  1. utils/generator.py +1 -213
  2. utils/prompts.py +50 -0
  3. utils/sources.py +176 -0
utils/generator.py CHANGED
@@ -1,10 +1,5 @@
1
  import logging
2
- import asyncio
3
- import json
4
- import ast
5
- import re
6
  from typing import List, Dict, Any, Union, AsyncGenerator
7
- from dotenv import load_dotenv
8
 
9
  # LangChain imports
10
  from langchain_openai import ChatOpenAI
@@ -15,6 +10,7 @@ from langchain_core.messages import SystemMessage, HumanMessage
15
 
16
  # Local imports
17
  from .utils import getconfig, get_auth
 
18
 
19
  # Set up logger
20
  logger = logging.getLogger(__name__)
@@ -53,214 +49,6 @@ def _get_chat_model():
53
  # Initialize chat model
54
  chat_model = _get_chat_model()
55
 
56
- # ---------------------------------------------------------------------
57
- # Core Processing Functions
58
- # ---------------------------------------------------------------------
59
- def _parse_citations(response: str) -> List[int]:
60
- """Parse citation numbers from response text"""
61
- citation_pattern = r'\[(\d+)\]'
62
- matches = re.findall(citation_pattern, response)
63
- citation_numbers = sorted(list(set(int(match) for match in matches)))
64
-
65
- return citation_numbers
66
-
67
- def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
68
- """Extract sources that were cited in the response"""
69
- if not cited_numbers:
70
- return []
71
-
72
- cited_sources = []
73
- for citation_num in cited_numbers:
74
- source_index = citation_num - 1
75
-
76
- if 0 <= source_index < len(processed_results):
77
- source = processed_results[source_index].copy() # Make copy to avoid modifying original
78
- source['_citation_number'] = citation_num # Preserve original citation number
79
- cited_sources.append(source)
80
-
81
- return cited_sources
82
-
83
- def clean_citations(response: str) -> str:
84
- """Normalize all citation formats to [x] and remove unwanted sections"""
85
-
86
- # Remove References/Sources/Bibliography sections
87
- ref_patterns = [
88
- r'\n\s*#+\s*References?\s*:?.*$',
89
- r'\n\s*#+\s*Sources?\s*:?.*$',
90
- r'\n\s*#+\s*Bibliography\s*:?.*$',
91
- r'\n\s*References?\s*:.*$',
92
- r'\n\s*Sources?\s*:.*$',
93
- r'\n\s*Bibliography\s*:.*$',
94
- ]
95
- for pattern in ref_patterns:
96
- response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
97
-
98
- # Fix (Document X, Page Y, Year Z) -> [X]
99
- response = re.sub(
100
- r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
101
- r'[\1]',
102
- response,
103
- flags=re.IGNORECASE
104
- )
105
-
106
- # Fix [Document X, Page Y, Year Z] -> [X]
107
- response = re.sub(
108
- r'\[Document\s+(\d+)(?:[^\]]*)\]',
109
- r'[\1]',
110
- response,
111
- flags=re.IGNORECASE
112
- )
113
-
114
- # Fix [Document X: filename, Page Y, Year Z] -> [X]
115
- response = re.sub(
116
- r'\[Document\s+(\d+):[^\]]+\]',
117
- r'[\1]',
118
- response,
119
- flags=re.IGNORECASE
120
- )
121
-
122
- # Fix [X.Y.Z] style (section numbers) -> [X]
123
- response = re.sub(
124
- r'\[(\d+)\.[\d\.]+\]',
125
- r'[\1]',
126
- response
127
- )
128
-
129
- # Fix (Document X) -> [X]
130
- response = re.sub(
131
- r'\(Document\s+(\d+)\)',
132
- r'[\1]',
133
- response,
134
- flags=re.IGNORECASE
135
- )
136
-
137
- # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
138
- response = re.sub(
139
- r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
140
- r'[\1]',
141
- response,
142
- flags=re.IGNORECASE
143
- )
144
-
145
- # Fix "Document X states/says/mentions" -> [X]
146
- response = re.sub(
147
- r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
148
- r'[\1]',
149
- response,
150
- flags=re.IGNORECASE
151
- )
152
-
153
- # Clean up any double citations [[1]] -> [1]
154
- response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)
155
-
156
- # Clean up multiple spaces
157
- response = re.sub(r'\s+', ' ', response)
158
-
159
- return response.strip()
160
-
161
- def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
162
- """Process context and return formatted context string and processed results"""
163
- processed_results = []
164
-
165
- if isinstance(context, list):
166
- if not context:
167
- raise ValueError("No retrieval results provided")
168
-
169
- # Extract relevant fields from retrieval results
170
- for result in context:
171
- if isinstance(result, str):
172
- result = ast.literal_eval(result)
173
-
174
- metadata = result.get('answer_metadata', {})
175
- doc_info = {
176
- 'answer': result.get('answer', ''),
177
- 'filename': metadata.get('filename', 'Unknown'),
178
- 'page': metadata.get('page', 'Unknown'),
179
- 'year': metadata.get('year', 'Unknown'),
180
- 'source': metadata.get('source', 'Unknown'),
181
- 'document_id': metadata.get('_id', 'Unknown')
182
- }
183
- processed_results.append(doc_info)
184
-
185
- # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
186
- context_parts = []
187
- for i, result in enumerate(processed_results, 1):
188
- # Simple format: [1], [2], etc.
189
- context_parts.append(f"[{i}]\n{result['answer']}\n")
190
-
191
- formatted_context = "\n".join(context_parts)
192
-
193
- elif isinstance(context, str):
194
- if not context.strip():
195
- raise ValueError("Context cannot be empty")
196
- formatted_context = context
197
- else:
198
- raise ValueError("Context must be either a string or list of retrieval results")
199
-
200
- return formatted_context, processed_results
201
-
202
- def _build_messages(question: str, context: str) -> list:
203
- """Build messages for LLM call"""
204
- system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
205
- You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
206
- Provide a clear and structured answer based on the passages/context provided and the guidelines.
207
-
208
- Guidelines:
209
- - If the passages have useful facts or numbers, use them in your answer.
210
- - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
211
- - If it makes sense, use bullet points and lists to make your answers easier to understand.
212
- - You do not need to use every passage. Only use the ones that help answer the question.
213
- - Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
214
- - Stay focused on the user's question. Do not add unrelated sections or topics.
215
-
216
- CRITICAL - CITATION FORMAT:
217
- Citations MUST be in this exact format: [1], [2], [3], etc.
218
- - ONLY the number in square brackets
219
- - Place at the end of relevant sentences
220
- - For multiple sources: [1][2]
221
-
222
- CORRECT:
223
- ✓ "The budget was UGX.284bn [2]."
224
- ✓ "Funding was approved by Parliament [1][3]."
225
-
226
- NEVER USE:
227
- ✗ [Document 1, Page 295, Year 2021]
228
- ✗ (Document 3, Page 23, 2021)
229
- ✗ Document 5, Page 295, 2021
230
- ✗ [2.2.2]
231
- ✗ "Document 5 states"
232
-
233
- DO NOT add a "References", "Sources", or "Bibliography" section at the end.
234
-
235
- FOLLOW-UP QUESTIONS:
236
- - If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
237
- - Base the question on information found in the context or natural extensions of the user's query.
238
- - Format: "You might also want to know:"
239
- - Keep it concise and directly related to the audit reports.
240
-
241
- If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
242
- """
243
- user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
244
- return [SystemMessage(content=system_content), HumanMessage(content=user_content)]
245
-
246
- def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
247
- """Create sources list for ChatUI format"""
248
- sources = []
249
- for result in cited_sources:
250
- filename = result.get('filename', 'Unknown')
251
- page = result.get('page', 'Unknown')
252
- year = result.get('year', 'Unknown')
253
-
254
- link = f"doc://{filename}"
255
- title_parts = [filename]
256
- if page != 'Unknown':
257
- title_parts.append(f"Page {page}")
258
- if year != 'Unknown':
259
- title_parts.append(f"({year})")
260
-
261
- sources.append({"link": link, "title": " - ".join(title_parts)})
262
-
263
- return sources
264
 
265
  # ---------------------------------------------------------------------
266
  # LLM Call Functions
 
1
  import logging
 
 
 
 
2
  from typing import List, Dict, Any, Union, AsyncGenerator
 
3
 
4
  # LangChain imports
5
  from langchain_openai import ChatOpenAI
 
10
 
11
  # Local imports
12
  from .utils import getconfig, get_auth
13
+ from .sources import _process_context, _build_messages, _parse_citations, _extract_sources, _create_sources_list, clean_citations
14
 
15
  # Set up logger
16
  logger = logging.getLogger(__name__)
 
49
  # Initialize chat model
50
  chat_model = _get_chat_model()
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # ---------------------------------------------------------------------
54
  # LLM Call Functions
utils/prompts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ system_prompt = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
2
+ You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
3
+ Provide a clear and structured answer based on the passages/context provided and the guidelines.
4
+
5
+ Guidelines:
6
+ - If the passages have useful facts or numbers, use them in your answer.
7
+ - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
8
+ - If it makes sense, use bullet points and lists to make your answers easier to understand.
9
+ - You do not need to use every passage. Only use the ones that help answer the question.
10
+ - Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
11
+ - Stay focused on the user's question. Do not add unrelated sections or topics.
12
+
13
+ CRITICAL - CITATION REQUIREMENTS:
14
+ EVERY factual statement, description, or claim MUST be cited. This includes:
15
+ - Numerical data and statistics
16
+ - Descriptions of what things are or how they work
17
+ - Background information about concepts, systems, or datasets
18
+ - Suggested applications or use cases based on context information
19
+ - ANY information derived from the passages
20
+
21
+ CRITICAL - CITATION FORMAT:
22
+ Citations MUST be in this exact format: [1], [2], [3], etc.
23
+ - ONLY the number in square brackets
24
+ - Place at the end of relevant sentences
25
+ - For multiple sources: [1][2]
26
+ - If an entire paragraph is based on one source, cite it at the end of the paragraph
27
+
28
+ CORRECT:
29
+ ✓ "The budget was UGX.284bn [2]."
30
+ ✓ "Funding was approved by Parliament [1][3]."
31
+ ✓ "The dataset is designed to bolster analytical capabilities [1]."
32
+
33
+
34
+ NEVER USE:
35
+ ✗ [Document 1, Page 295, Year 2021]
36
+ ✗ (Document 3, Page 23, 2021)
37
+ ✗ Document 5, Page 295, 2021
38
+ ✗ [2.2.2]
39
+ ✗ "Document 5 states"
40
+
41
+ DO NOT add a "References", "Sources", or "Bibliography" section at the end.
42
+
43
+ FOLLOW-UP QUESTIONS:
44
+ - If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
45
+ - Base the question on information found in the context or natural extensions of the user's query.
46
+ - Format: "You might also want to know:"
47
+ - Keep it concise and directly related to the audit reports.
48
+
49
+ If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
50
+ """
utils/sources.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict, Any, Union
3
+ import ast
4
+ from langchain_core.messages import SystemMessage, HumanMessage
5
+ from .prompts import system_prompt
6
+
7
+ # ---------------------------------------------------------------------
8
+ # Core Processing Functions
9
+ # ---------------------------------------------------------------------
10
+ def _parse_citations(response: str) -> List[int]:
11
+ """Parse citation numbers from response text"""
12
+ citation_pattern = r'\[(\d+)\]'
13
+ matches = re.findall(citation_pattern, response)
14
+ citation_numbers = sorted(list(set(int(match) for match in matches)))
15
+
16
+ return citation_numbers
17
+
18
+ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
19
+ """Extract sources that were cited in the response"""
20
+ if not cited_numbers:
21
+ return []
22
+
23
+ cited_sources = []
24
+ for citation_num in cited_numbers:
25
+ source_index = citation_num - 1
26
+
27
+ if 0 <= source_index < len(processed_results):
28
+ source = processed_results[source_index].copy() # Make copy to avoid modifying original
29
+ source['_citation_number'] = citation_num # Preserve original citation number
30
+ cited_sources.append(source)
31
+
32
+ return cited_sources
33
+
34
def clean_citations(response: str) -> str:
    """Normalize citations to the ``[N]`` form and drop trailing reference lists.

    Steps, in order:

    1. Remove everything from a "References" / "Sources" / "Bibliography"
       heading (a markdown ``#`` heading, or the bare label followed by a
       colon) through the end of the text.
    2. Rewrite non-conforming citation styles to ``[N]``:
       ``(Document N, Page P, Year Y)``, ``[Document N ...]``, ``[N.x.y]``
       and an unbracketed ``Document N, Page P, Year Y``.
    3. Collapse doubled brackets ``[[N]]`` to ``[N]``.
    4. Tidy whitespace without destroying line structure, so markdown lists
       and paragraphs in the answer survive.

    Args:
        response: Raw model output.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove References/Sources/Bibliography sections. DOTALL lets ``.*``
    # run across newlines so the whole tail of the text is dropped.
    ref_patterns = [
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    ]
    for pattern in ref_patterns:
        response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)

    # Each rule captures the document number in group 1. Order matters: the
    # bracketed/parenthesised forms must be consumed before the bare form.
    citation_rewrites = [
        # (Document X), (Document X, Page Y), (Document X, Page Y, Year Z)
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', re.IGNORECASE),
        # [Document X], [Document X, Page Y, Year Z], [Document X: filename, ...]
        (r'\[Document\s+(\d+)[^\]]*\]', re.IGNORECASE),
        # [X.Y.Z] style (section numbers)
        (r'\[(\d+)\.[\d.]+\]', 0),
        # Bare "Document X, Page Y, Year Z" (also covers "Document X states").
        # \b avoids matching inside a longer word; the lookahead accepts end
        # of text so a citation closing the response is normalized too.
        (r'\bDocument\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.]|$)', re.IGNORECASE),
    ]
    for pattern, flags in citation_rewrites:
        response = re.sub(pattern, r'[\1]', response, flags=flags)

    # Clean up any double citations [[1]] -> [1]
    response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)

    # Whitespace tidy-up that keeps newlines: collapsing every whitespace
    # run to a single space would flatten bullet lists and paragraphs.
    response = response.replace('\r\n', '\n')
    response = re.sub(r'[ \t]+(?=\n)', '', response)      # trailing spaces on a line
    response = re.sub(r'(?<=\S)[ \t]+', ' ', response)    # runs inside a line; leading indent kept
    response = re.sub(r'\n{3,}', '\n\n', response)        # at most one blank line in a row

    return response.strip()
111
+
112
+ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
113
+ """Process context and return formatted context string and processed results"""
114
+ processed_results = []
115
+
116
+ if isinstance(context, list):
117
+ if not context:
118
+ raise ValueError("No retrieval results provided")
119
+
120
+ # Extract relevant fields from retrieval results
121
+ for result in context:
122
+ if isinstance(result, str):
123
+ result = ast.literal_eval(result)
124
+
125
+ metadata = result.get('answer_metadata', {})
126
+ doc_info = {
127
+ 'answer': result.get('answer', ''),
128
+ 'filename': metadata.get('filename', 'Unknown'),
129
+ 'page': metadata.get('page', 'Unknown'),
130
+ 'year': metadata.get('year', 'Unknown'),
131
+ 'source': metadata.get('source', 'Unknown'),
132
+ 'document_id': metadata.get('_id', 'Unknown')
133
+ }
134
+ processed_results.append(doc_info)
135
+
136
+ # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
137
+ context_parts = []
138
+ for i, result in enumerate(processed_results, 1):
139
+ # Simple format: [1], [2], etc.
140
+ context_parts.append(f"[{i}]\n{result['answer']}\n")
141
+
142
+ formatted_context = "\n".join(context_parts)
143
+
144
+ elif isinstance(context, str):
145
+ if not context.strip():
146
+ raise ValueError("Context cannot be empty")
147
+ formatted_context = context
148
+ else:
149
+ raise ValueError("Context must be either a string or list of retrieval results")
150
+
151
+ return formatted_context, processed_results
152
+
153
def _build_messages(system_prompt: str, question: str, context: str) -> list:
    """Assemble the message list for the LLM call.

    The result is a two-element list: a ``SystemMessage`` carrying
    *system_prompt*, then a ``HumanMessage`` whose text places *context*
    under a ``### CONTEXT`` heading and *question* under a
    ``### USER QUESTION`` heading.
    """
    # The parameter shadows the module-level ``system_prompt`` import here,
    # so the caller-supplied prompt is the one that is used.
    system_message = SystemMessage(content=system_prompt)
    human_message = HumanMessage(
        content=f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
    )
    return [system_message, human_message]
158
+
159
+ def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
160
+ """Create sources list for ChatUI format"""
161
+ sources = []
162
+ for result in cited_sources:
163
+ filename = result.get('filename', 'Unknown')
164
+ page = result.get('page', 'Unknown')
165
+ year = result.get('year', 'Unknown')
166
+
167
+ link = f"doc://{filename}"
168
+ title_parts = [filename]
169
+ if page != 'Unknown':
170
+ title_parts.append(f"Page {page}")
171
+ if year != 'Unknown':
172
+ title_parts.append(f"({year})")
173
+
174
+ sources.append({"link": link, "title": " - ".join(title_parts)})
175
+
176
+ return sources