Spaces:
Sleeping
Sleeping
| import re | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| from typing import List, Dict, Any, Optional, Tuple | |
| from langchain_core.documents import Document | |
| # --------------------------------------------------------------------- | |
| # Core Processing Functions | |
| # --------------------------------------------------------------------- | |
def parse_citations(response: str) -> List[int]:
    """Return the distinct citation numbers found in *response*, ascending.

    Both single citations such as ``[1]`` and comma-separated groups such as
    ``[1, 2, 3]`` are recognised. Any bracket whose content consists only of
    digits, commas and whitespace is treated as a citation group.
    """
    # Grab the inside of every bracket group that looks like a citation.
    bracket_groups = re.findall(r'\[([\d,\s]+)\]', response)

    # A set comprehension de-duplicates numbers that are cited several times.
    unique_numbers = {
        int(digits)
        for group in bracket_groups
        for digits in re.findall(r'\d+', group)
    }

    citation_numbers = sorted(unique_numbers)
    logger.debug(f"Probable Citations found: {citation_numbers}")
    return citation_numbers
def extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Return copies of the entries in *processed_results* that were cited.

    Citation numbers are 1-based positions into *processed_results*. Numbers
    that fall outside the list are ignored. Each returned entry is a shallow
    copy of the original with the citation number stored under
    ``'_citation_number'``, so the input dictionaries are not modified.
    """

    def _tagged_copy(number: int) -> Dict[str, Any]:
        # Copy first so the caller's dictionary never gains the extra key.
        entry = processed_results[number - 1].copy()
        entry['_citation_number'] = number
        return entry

    # A falsy ``cited_numbers`` (empty list, None) yields an empty result.
    return [
        _tagged_copy(number)
        for number in (cited_numbers or [])
        if 0 <= number - 1 < len(processed_results)
    ]
# Headings that start a trailing reference list. Each pattern is compiled with
# DOTALL so that ``.*$`` swallows everything from the heading to the end of
# the text. The un-prefixed variants require a colon so that an ordinary line
# beginning with the word "Sources" is not mistaken for a heading.
_REFERENCE_SECTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
)

# Ordered rewrite rules; every match is replaced by "[<first group>]".
_CITATION_PATTERNS = (
    # (Document X), (Document X, Page Y), (Document X, Page Y, Year Z)
    re.compile(
        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
        re.IGNORECASE,
    ),
    # [Document X ...] including [Document X: filename, Page Y, Year Z]
    re.compile(r'\[Document\s+(\d+)[^\]]*\]', re.IGNORECASE),
    # [X.Y.Z] section-number style
    re.compile(r'\[(\d+)\.[\d\.]+\]'),
    # Bare "Document X[, Page Y][, Year Z]". The leading \b keeps words such
    # as "subdocument" intact; the trailing (?!\w) accepts end of text and any
    # punctuation, not just whitespace, comma or full stop.
    re.compile(
        r'\bDocument\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?!\w)',
        re.IGNORECASE,
    ),
    # [[X]] left behind by an earlier rewrite inside existing brackets
    re.compile(r'\[\[(\d+)\]\]'),
)

_WHITESPACE_RUN = re.compile(r'\s+')


def clean_citations(response: str) -> str:
    """Normalize all citation formats to ``[x]`` and remove unwanted sections.

    Steps, in order:

    1. Drop a trailing References / Sources / Bibliography section (from the
       heading to the end of the text).
    2. Rewrite ``(Document X, ...)``, ``[Document X ...]``, ``[X.Y.Z]`` and
       bare ``Document X`` mentions to ``[X]``.
    3. Collapse ``[[X]]`` to ``[X]``.
    4. Collapse every run of whitespace to a single space and strip the ends.

    Args:
        response: Raw text to clean.

    Returns:
        The cleaned text on a single line.
    """
    for section_pattern in _REFERENCE_SECTION_PATTERNS:
        response = section_pattern.sub('', response)

    for citation_pattern in _CITATION_PATTERNS:
        response = citation_pattern.sub(r'[\1]', response)

    # NOTE(review): this collapses newlines as well as spaces, so paragraph
    # and list structure in the response is flattened. Kept unchanged for
    # backward compatibility; confirm with callers whether that is intended.
    response = _WHITESPACE_RUN.sub(' ', response)
    return response.strip()
def process_context(
    context: List[Document],
    metadata_fields_to_include: Optional[List[str]] = None
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Turn retrieved LangChain Documents into a numbered context string plus a
    list of standardized per-document dictionaries.

    Args:
        context: A list of LangChain Document objects.
        metadata_fields_to_include: Optional list of metadata keys (e.g., ['source', 'page'])
            whose non-None values are shown in the context string and copied
            into the per-document dictionary.

    Returns:
        A tuple: (formatted_context_string, processed_results_list). Documents
        are numbered from 1 and separated by a line containing '---'. Each
        result dictionary holds the page content under 'answer', the full
        metadata under '__all_metadata__' and the number under
        '_citation_number_key'. Empty input yields ("", []).

    Raises:
        ValueError: If context is not a list made up solely of Document objects.
    """
    logger.debug(f"Context Processing: \n Context: {context} \n Metadata_fileds: {metadata_fields_to_include}")
    logger.info(f"Context Processing with Metadata_fileds_to_include: {metadata_fields_to_include}")

    # 1. Input validation
    is_document_list = isinstance(context, list) and all(
        isinstance(item, Document) for item in context
    )
    if not is_document_list:
        raise ValueError("Context must be a list of LangChain Document objects.")
    if not context:
        return "", []

    selected_fields = metadata_fields_to_include or []
    processed_results = []
    rendered_blocks = []

    # 2. Build one record and one rendered text block per document
    for number, doc in enumerate(context, start=1):
        record = {
            'answer': doc.page_content,
            '__all_metadata__': doc.metadata,  # full metadata, kept for later citation linking
            '_citation_number_key': number,
        }

        labelled_values = []
        for field in selected_fields:
            value = doc.metadata.get(field)
            if value is None:
                continue
            record[field] = value
            # 'meeting_id' is shown as 'Meeting Id' in the prompt
            label = field.replace('_', ' ').title()
            labelled_values.append(f"{label}: {value}")

        if labelled_values:
            metadata_line = " | ".join(labelled_values)
            block = f"[{number}] **This is Metadata** \n ({metadata_line})\n **Contextual Text** \n {doc.page_content}"
        else:
            block = f"[{number}]\n{doc.page_content}"

        logger.debug(f" Updated Context {number}: {block}")
        rendered_blocks.append(block)
        processed_results.append(record)

    return "\n---\n".join(rendered_blocks), processed_results
def create_sources_list(
    cited_sources: List[Dict[str, Any]],
    title_metadata_fields: List[str],
    link_metadata_field: str
) -> List[Dict[str, str]]:
    """
    Create sources list for ChatUI format using configuration for title and link fields.

    Args:
        cited_sources: List of standardized dictionaries that were cited. Each
            is expected to carry its full metadata under '__all_metadata__'
            and its number under '_citation_number' (or '_citation_number_key').
        title_metadata_fields: List of metadata keys (e.g., ['document_type', 'decision_number'])
            to use to build the source title.
        link_metadata_field: The single metadata key (e.g., 'document_url') to use for the source link (URL).

    Returns:
        One ``{"uri": ..., "title": ...}`` dictionary per cited source, in
        input order. The title joins the non-None title fields with " - " and
        falls back to "Source <number>". The uri falls back to 'doc://#' when
        the link is missing, blank or '#'.
    """
    sources = []
    logger.info("creating sources list for ChatUI")
    logger.debug("Raw Cited sources: %s", cited_sources)

    for result in cited_sources:
        # ``or {}`` also covers an entry whose metadata is explicitly None.
        all_meta = result.get('__all_metadata__') or {}

        # Entries that did not receive '_citation_number' may still carry the
        # number under '_citation_number_key'; use it before giving up.
        citation_num = result.get(
            '_citation_number',
            result.get('_citation_number_key', 'N/A'),
        )

        # 1. Build the title from the configured fields
        title_parts = [
            str(all_meta[field])
            for field in title_metadata_fields
            if all_meta.get(field) is not None
        ]
        title = " - ".join(title_parts) if title_parts else f"Source {citation_num}"

        # 2. Extract the link. ChatUI requires URLs to match doc://, http://,
        # or https:// schemes, so a missing, blank or '#' link becomes the
        # 'doc://#' placeholder. The value is coerced to str so the returned
        # mapping really is str -> str.
        raw_link = all_meta.get(link_metadata_field)
        link = str(raw_link).strip() if raw_link else ''
        if not link or link == '#':
            link = 'doc://#'

        sources.append({
            "uri": link,
            "title": title
        })

    logger.debug("formatted cited sources :%s", sources)
    return sources