File size: 8,836 Bytes
3194955
 
 
f32f200
3194955
 
 
 
 
 
 
f32f200
 
 
3194955
f32f200
 
 
 
 
 
 
 
3194955
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6eb6385
 
 
 
 
 
3194955
 
4db8949
3194955
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import re
import logging
logger = logging.getLogger(__name__)
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.documents import Document


# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------
def parse_citations(response: str) -> List[int]:
    """Collect the distinct citation numbers referenced in *response*.

    Recognizes both single citations like ``[1]`` and comma-separated
    groups like ``[1,2,3]`` (whitespace inside the brackets is tolerated).

    Args:
        response: The LLM response text to scan.

    Returns:
        Sorted list of unique citation numbers found in the text.
    """
    # Bracket groups containing only digits, commas, and whitespace.
    found: set = set()
    for group in re.findall(r'\[([\d,\s]+)\]', response):
        # Pull every run of digits out of the group and record it.
        found.update(int(token) for token in re.findall(r'\d+', group))

    citation_numbers = sorted(found)
    logger.debug(f"Probable Citations found: {citation_numbers}")
    return citation_numbers

def extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Return the processed results that correspond to the cited numbers.

    Citation numbers are 1-based indices into *processed_results*; numbers
    that fall outside the list are silently ignored.

    Args:
        processed_results: Standardized per-document dicts from process_context.
        cited_numbers: Citation numbers parsed out of the LLM response.

    Returns:
        Shallow copies of the cited entries, each tagged with its original
        citation number under ``_citation_number``.
    """
    if not cited_numbers:
        return []

    total = len(processed_results)
    cited_sources: List[Dict[str, Any]] = []
    for number in cited_numbers:
        index = number - 1  # citations are 1-based
        if not (0 <= index < total):
            continue
        # Copy so the caller's processed_results entries stay untouched.
        entry = dict(processed_results[index])
        entry['_citation_number'] = number
        cited_sources.append(entry)

    return cited_sources

def clean_citations(response: str) -> str:
    """Normalize all citation formats to ``[x]`` and drop trailing
    References/Sources/Bibliography sections.

    The substitutions below are applied strictly in order; later rules
    operate on the output of earlier ones.
    """
    section_flags = re.IGNORECASE | re.DOTALL

    # Ordered (pattern, replacement, flags) rules.
    rules = [
        # Trailing References/Sources/Bibliography sections (with or
        # without markdown headings) are removed entirely.
        (r'\n\s*#+\s*References?\s*:?.*$', '', section_flags),
        (r'\n\s*#+\s*Sources?\s*:?.*$', '', section_flags),
        (r'\n\s*#+\s*Bibliography\s*:?.*$', '', section_flags),
        (r'\n\s*References?\s*:.*$', '', section_flags),
        (r'\n\s*Sources?\s*:.*$', '', section_flags),
        (r'\n\s*Bibliography\s*:.*$', '', section_flags),
        # (Document X, Page Y, Year Z) -> [X]
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', r'[\1]', re.IGNORECASE),
        # [Document X, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+)(?:[^\]]*)\]', r'[\1]', re.IGNORECASE),
        # [Document X: filename, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+):[^\]]+\]', r'[\1]', re.IGNORECASE),
        # [X.Y.Z] section-number style -> [X]
        (r'\[(\d+)\.[\d\.]+\]', r'[\1]', 0),
        # (Document X) -> [X]
        (r'\(Document\s+(\d+)\)', r'[\1]', re.IGNORECASE),
        # Bare "Document X, Page Y, Year Z" -> [X]
        (r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])', r'[\1]', re.IGNORECASE),
        # "Document X states/says/..." -> [X]
        (r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)', r'[\1]', re.IGNORECASE),
        # Collapse accidental double brackets [[1]] -> [1]
        (r'\[\[(\d+)\]\]', r'[\1]', 0),
        # Squash all runs of whitespace (including newlines) to one space.
        (r'\s+', ' ', 0),
    ]

    for pattern, replacement, flags in rules:
        response = re.sub(pattern, replacement, response, flags=flags)

    return response.strip()

def process_context(
    context: List[Document],
    metadata_fields_to_include: Optional[List[str]] = None
    ) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Processes LangChain Documents, extracts content and selected metadata,
    and returns the formatted context string and the processed list of results.
    
    Args:
        context: A list of LangChain Document objects from the retriever.
        metadata_fields_to_include: Optional list of metadata keys (e.g., ['source', 'page']) 
                                    to include in the formatted context string sent to the LLM.
                                    
    Returns:
        A tuple: (formatted_context_string, processed_results_list)

    Raises:
        ValueError: If context is not a list of Document objects.
    """
    # Full context can be large; dump it only at DEBUG level.
    logger.debug(f"Context Processing: \n Context: {context} \n Metadata_fields: {metadata_fields_to_include}")
    logger.info(f"Context Processing with metadata_fields_to_include: {metadata_fields_to_include}")
    
    # 1. Input Validation
    if not isinstance(context, list) or not all(isinstance(doc, Document) for doc in context):
        # Raise a specific error if input is not what's expected
        raise ValueError("Context must be a list of LangChain Document objects.")
    if not context:
        return "", []

    processed_results = []
    metadata_fields_to_include = metadata_fields_to_include or []

    # 2. Standardize Structure and Build Context String
    context_parts = []
    
    # Enumerate from 1 so the index doubles as the citation number [i].
    for i, doc in enumerate(context, 1):
        # The primary dictionary that holds all info for this document
        doc_info = {
            'answer': doc.page_content,
            '__all_metadata__': doc.metadata, # Store all metadata for citation linking later
            '_citation_number_key': i        # Store the citation number
        }
        
        # Extract selected metadata fields for the prompt string
        metadata_str_parts = []
        for field in metadata_fields_to_include:
            value = doc.metadata.get(field)
            if value is not None:
                # Store the value in the doc_info dict and format for the prompt string
                doc_info[field] = value
                
                # Format the field for readability in the prompt
                field_name = field.replace('_', ' ').title()
                metadata_str_parts.append(f"{field_name}: {value}")
        
        # Build the document string. With metadata the format is:
        #   [i] **This is Metadata** \n (Field: value | ...)\n **Contextual Text** \n <page_content>
        if metadata_str_parts:
            metadata_line = " | ".join(metadata_str_parts)
            context_str = f"[{i}] **This is Metadata** \n ({metadata_line})\n **Contextual Text** \n {doc.page_content}"
        else:
            context_str = f"[{i}]\n{doc.page_content}"
        
        logger.debug(f" Updated Context {i}: {context_str}")
        context_parts.append(context_str)
        processed_results.append(doc_info) # Collect the standardized dict for later use
    formatted_context = "\n---\n".join(context_parts)
    
    return formatted_context, processed_results

def create_sources_list(
    cited_sources: List[Dict[str, Any]],
    title_metadata_fields: List[str],
    link_metadata_field: str
    ) -> List[Dict[str, str]]:
    """
    Create sources list for ChatUI format using configuration for title and link fields.

    Args:
        cited_sources: List of standardized dictionaries that were cited.
        title_metadata_fields: List of metadata keys (e.g., ['document_type', 'decision_number']) 
                               to use to build the source title.
        link_metadata_field: The single metadata key (e.g., 'document_url') to use for the source link (URL).

    Returns:
        A list of ``{"uri": ..., "title": ...}`` dicts, one per cited source.
    """
    logger.info("creating sources list for ChatUI")
    logger.debug(f"Raw Cited sources: {cited_sources}")

    sources: List[Dict[str, str]] = []
    for entry in cited_sources:
        # Full original metadata was preserved by process_context.
        metadata = entry.get('__all_metadata__', {})
        citation_number = entry.get('_citation_number', 'N/A')

        # 1. Title: join the configured fields that are actually present;
        #    fall back to a generic label when none are.
        present_values = (metadata.get(field) for field in title_metadata_fields)
        title_parts = [str(value) for value in present_values if value is not None]
        title = " - ".join(title_parts) if title_parts else f"Source {citation_number}"

        # 2. Link: ChatUI only accepts doc://, http://, or https:// URLs,
        #    so empty or '#' placeholders become a doc:// stub.
        link = metadata.get(link_metadata_field)
        if not link or link == '#':
            link = 'doc://#'

        sources.append({"uri": link, "title": title})

    logger.debug(f"formatted cited sources :{sources}")

    return sources