"""Citation parsing, normalization, and source-list utilities for RAG responses.

Pipeline overview:
    1. ``process_context`` turns retrieved LangChain Documents into a numbered
       context string for the LLM plus standardized per-document dicts.
    2. ``clean_citations`` normalizes the LLM's citation formats to ``[x]``.
    3. ``parse_citations`` extracts the cited numbers from the response.
    4. ``extract_sources`` / ``create_sources_list`` map those numbers back to
       the retrieved documents for ChatUI display.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.documents import Document

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------

def parse_citations(response: str) -> List[int]:
    """Parse citation numbers from response text.

    Handles both single citations (``[1]``) and comma-separated groups
    (``[1,2,3]``).

    Args:
        response: The response text emitted by the LLM.

    Returns:
        Sorted list of unique citation numbers found in the text.
    """
    # Match both single citations [1] and comma-separated citations [1,2,3,4,5]
    citation_pattern = r'\[([\d,\s]+)\]'
    matches = re.findall(citation_pattern, response)

    citation_numbers = set()
    for match in matches:
        # Split by comma and extract all numbers
        numbers = re.findall(r'\d+', match)
        citation_numbers.update(int(num) for num in numbers)

    # sorted() accepts a set directly; the extra list() wrapper was redundant.
    result = sorted(citation_numbers)
    logger.debug("Probable Citations found: %s", result)
    return result


def extract_sources(
    processed_results: List[Dict[str, Any]],
    cited_numbers: List[int],
) -> List[Dict[str, Any]]:
    """Extract the subset of processed results that were cited in the response.

    Args:
        processed_results: Standardized per-document dicts produced by
            ``process_context`` (order matches citation numbering).
        cited_numbers: 1-based citation numbers parsed from the response.

    Returns:
        Copies of the cited result dicts, each tagged with its original
        citation number under ``'_citation_number'``. Citation numbers that
        fall outside the result range are silently skipped.
    """
    if not cited_numbers:
        return []

    cited_sources = []
    for citation_num in cited_numbers:
        source_index = citation_num - 1  # citations are 1-based
        if 0 <= source_index < len(processed_results):
            # Shallow copy so callers can annotate without mutating the
            # original result dicts.
            source = processed_results[source_index].copy()
            source['_citation_number'] = citation_num  # preserve original number
            cited_sources.append(source)
    return cited_sources


def clean_citations(response: str) -> str:
    """Normalize all citation formats to ``[x]`` and remove unwanted sections.

    Strips trailing References/Sources/Bibliography sections, then rewrites
    the many citation spellings LLMs produce (``(Document X, Page Y)``,
    ``[Document X: file]``, bare ``Document X states ...``, section-number
    style ``[1.2.3]``, doubled ``[[1]]``) into the canonical ``[X]`` form.

    Note: the final whitespace pass collapses ALL runs of whitespace —
    including newlines — into single spaces.

    Args:
        response: Raw LLM response text.

    Returns:
        The cleaned, whitespace-collapsed response.
    """
    # Remove References/Sources/Bibliography sections (markdown-headed or
    # plain "References:" style); DOTALL makes each pattern eat to the end.
    ref_patterns = [
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    ]
    for pattern in ref_patterns:
        response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)

    # Fix (Document X, Page Y, Year Z) -> [X]
    response = re.sub(
        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Fix [Document X, Page Y, Year Z] -> [X]
    response = re.sub(
        r'\[Document\s+(\d+)(?:[^\]]*)\]',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Fix [Document X: filename, Page Y, Year Z] -> [X]
    response = re.sub(
        r'\[Document\s+(\d+):[^\]]+\]',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Fix [X.Y.Z] style (section numbers) -> [X]
    response = re.sub(
        r'\[(\d+)\.[\d\.]+\]',
        r'[\1]',
        response,
    )

    # Fix (Document X) -> [X]
    response = re.sub(
        r'\(Document\s+(\d+)\)',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
    response = re.sub(
        r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Fix "Document X states/says/mentions" -> [X]
    response = re.sub(
        r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )

    # Clean up any double citations [[1]] -> [1]
    response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)

    # Clean up multiple spaces (also flattens newlines -- see docstring note)
    response = re.sub(r'\s+', ' ', response)

    return response.strip()


def process_context(
    context: List[Document],
    metadata_fields_to_include: Optional[List[str]] = None,
) -> Tuple[str, List[Dict[str, Any]]]:
    """Process LangChain Documents into a prompt context string and result dicts.

    Extracts content and selected metadata from each Document and builds a
    numbered context string for the LLM, alongside a standardized list of
    per-document dicts used later for citation linking.

    Args:
        context: A list of LangChain Document objects from the retriever.
        metadata_fields_to_include: Optional list of metadata keys
            (e.g., ['source', 'page']) to include in the formatted context
            string sent to the LLM.

    Returns:
        A tuple: (formatted_context_string, processed_results_list).

    Raises:
        ValueError: If ``context`` is not a list of Document objects.
    """
    logger.debug(
        "Context Processing: \n Context: %s \n Metadata_fields: %s",
        context,
        metadata_fields_to_include,
    )
    logger.info(
        "Context Processing with Metadata_fields_to_include: %s",
        metadata_fields_to_include,
    )

    # 1. Input Validation
    if not isinstance(context, list) or not all(
        isinstance(doc, Document) for doc in context
    ):
        # Raise a specific error if input is not what's expected
        raise ValueError("Context must be a list of LangChain Document objects.")
    if not context:
        return "", []

    processed_results = []
    metadata_fields_to_include = metadata_fields_to_include or []

    # 2. Standardize Structure and Build Context String
    context_parts = []
    for i, doc in enumerate(context, 1):
        # The primary dictionary that holds all info for this document
        doc_info = {
            'answer': doc.page_content,
            # Store all metadata for citation linking later
            '__all_metadata__': doc.metadata,
            # Store the citation number
            '_citation_number_key': i,
        }

        # Extract selected metadata fields for the prompt string
        metadata_str_parts = []
        for field in metadata_fields_to_include:
            value = doc.metadata.get(field)
            if value is not None:
                # Store the value in the doc_info dict and format for the prompt
                doc_info[field] = value
                # Format the field for readability in the prompt
                field_name = field.replace('_', ' ').title()
                metadata_str_parts.append(f"{field_name}: {value}")

        # Build the document string
        if metadata_str_parts:
            metadata_line = " | ".join(metadata_str_parts)
            # Example output: [1] (Type: decision, Meeting ID: 123)
            context_str = (
                f"[{i}] **This is Metadata** \n ({metadata_line})\n"
                f" **Contextual Text** \n {doc.page_content}"
            )
        else:
            context_str = f"[{i}]\n{doc.page_content}"

        logger.debug(" Updated Context %s: %s", i, context_str)
        context_parts.append(context_str)
        # Collect the standardized dict for later use
        processed_results.append(doc_info)

    formatted_context = "\n---\n".join(context_parts)
    return formatted_context, processed_results


def create_sources_list(
    cited_sources: List[Dict[str, Any]],
    title_metadata_fields: List[str],
    link_metadata_field: str,
) -> List[Dict[str, str]]:
    """Create sources list for ChatUI format using configured title/link fields.

    Args:
        cited_sources: List of standardized dictionaries that were cited.
        title_metadata_fields: List of metadata keys
            (e.g., ['document_type', 'decision_number']) used to build the
            source title.
        link_metadata_field: The single metadata key (e.g., 'document_url')
            to use for the source link (URL).

    Returns:
        List of ``{"uri": ..., "title": ...}`` dicts for ChatUI.
    """
    sources = []
    logger.info("creating sources list for ChatUI")
    logger.debug("Raw Cited sources: %s", cited_sources)

    for result in cited_sources:
        # We access the original, full metadata dictionary
        all_meta = result.get('__all_metadata__', {})
        citation_num = result.get('_citation_number', 'N/A')

        # 1. Build Title using configured fields
        title_parts = []
        for field in title_metadata_fields:
            value = all_meta.get(field)
            if value is not None:
                title_parts.append(str(value))

        # Create a descriptive title
        title = " - ".join(title_parts) if title_parts else f"Source {citation_num}"

        # 2. Extract Link using configured field (use 'doc://#' as fallback for
        #    empty/missing). ChatUI requires URLs to match doc://, http://, or
        #    https:// schemes.
        link = all_meta.get(link_metadata_field)
        if not link or link == '#':
            link = 'doc://#'  # Use doc:// scheme for placeholder links

        sources.append({
            "uri": link,
            "title": title,
        })

    logger.debug("formatted cited sources :%s", sources)
    return sources