Spaces:
Sleeping
Sleeping
File size: 8,836 Bytes
3194955 f32f200 3194955 f32f200 3194955 f32f200 3194955 6eb6385 3194955 4db8949 3194955 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
import re
import logging
logger = logging.getLogger(__name__)
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.documents import Document
# ---------------------------------------------------------------------
# Core Processing Functions
# ---------------------------------------------------------------------
def parse_citations(response: str) -> List[int]:
    """Parse citation numbers from response text.

    Handles both single citations like ``[1]`` and comma-separated
    groups like ``[1,2,3]``, deduplicating and sorting the result.

    Args:
        response: The LLM response text to scan for citations.

    Returns:
        Sorted list of unique citation numbers found in the text.
    """
    # Match bracketed runs of digits/commas/whitespace: [1], [1,2,3], [1, 2]
    matches = re.findall(r'\[([\d,\s]+)\]', response)
    citation_numbers = set()
    for match in matches:
        # Pull every number out of the group regardless of delimiter style
        citation_numbers.update(int(num) for num in re.findall(r'\d+', match))
    # sorted() accepts any iterable directly; the intermediate list() was redundant
    result = sorted(citation_numbers)
    # Lazy %-formatting avoids building the message when DEBUG is disabled
    logger.debug("Probable Citations found: %s", result)
    return result
def extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """Return copies of the processed results that were actually cited.

    Citation numbers are 1-based indices into ``processed_results``;
    out-of-range numbers are silently skipped.

    Args:
        processed_results: Standardized result dicts, in retrieval order.
        cited_numbers: Citation numbers parsed from the LLM response.

    Returns:
        A list of copied result dicts, each annotated with its original
        citation number under ``'_citation_number'``.
    """
    if not cited_numbers:
        return []
    total = len(processed_results)
    selected = []
    for number in cited_numbers:
        idx = number - 1
        if idx < 0 or idx >= total:
            continue  # citation points outside the retrieved results
        # Copy so annotating the entry does not mutate the original dict
        entry = processed_results[idx].copy()
        entry['_citation_number'] = number
        selected.append(entry)
    return selected
def clean_citations(response: str) -> str:
    """Normalize all citation formats to [x] and remove unwanted sections."""
    # 1) Strip trailing References/Sources/Bibliography sections. DOTALL makes
    #    ``.*$`` consume everything through the end of the text, so the whole
    #    section is removed, not just its heading line.
    section_patterns = (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
    for section in section_patterns:
        response = re.sub(section, '', response, flags=re.IGNORECASE | re.DOTALL)

    # 2) Ordered rewrites: each (pattern, replacement) turns one verbose
    #    document-reference style into the canonical [N] form. Order matters:
    #    earlier, more specific patterns run first. The digits-only patterns
    #    contain no cased characters, so IGNORECASE is a no-op for them.
    rewrites = (
        # (Document X, Page Y, Year Z) -> [X]
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', r'[\1]'),
        # [Document X, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+)(?:[^\]]*)\]', r'[\1]'),
        # [Document X: filename, Page Y, Year Z] -> [X]
        (r'\[Document\s+(\d+):[^\]]+\]', r'[\1]'),
        # [X.Y.Z] section-number style -> [X]
        (r'\[(\d+)\.[\d\.]+\]', r'[\1]'),
        # (Document X) -> [X]
        (r'\(Document\s+(\d+)\)', r'[\1]'),
        # Bare "Document X, Page Y, Year Z" (no brackets) -> [X]
        (r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])', r'[\1]'),
        # "Document X states/says/..." -> [X]
        (r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)', r'[\1]'),
        # Double brackets [[N]] -> [N]
        (r'\[\[(\d+)\]\]', r'[\1]'),
    )
    for pattern, replacement in rewrites:
        response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

    # 3) Collapse all whitespace runs (including newlines) to single spaces.
    response = re.sub(r'\s+', ' ', response)
    return response.strip()
def process_context(
    context: List[Document],
    metadata_fields_to_include: Optional[List[str]] = None
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Build the numbered LLM context string from retrieved LangChain Documents.

    Each document is numbered (1-based) so the LLM can cite it, and a
    standardized dict per document is returned for later citation linking.

    Args:
        context: A list of LangChain Document objects from the retriever.
        metadata_fields_to_include: Optional list of metadata keys
            (e.g., ['source', 'page']) to surface in the prompt string.

    Returns:
        A tuple: (formatted_context_string, processed_results_list)

    Raises:
        ValueError: If ``context`` is not a list of Document objects.
    """
    logger.debug(f"Context Processing: \n Context: {context} \n Metadata_fileds: {metadata_fields_to_include}")
    logger.info(f"Context Processing with Metadata_fileds_to_include: {metadata_fields_to_include}")

    # Validate input shape up front; downstream code assumes Documents.
    if not isinstance(context, list) or not all(isinstance(doc, Document) for doc in context):
        raise ValueError("Context must be a list of LangChain Document objects.")
    if not context:
        return "", []

    fields = metadata_fields_to_include or []
    processed_results: List[Dict[str, Any]] = []
    context_parts: List[str] = []

    for number, doc in enumerate(context, start=1):
        # Standardized record holding everything needed for citation linking.
        doc_info: Dict[str, Any] = {
            'answer': doc.page_content,
            '__all_metadata__': doc.metadata,   # full metadata for later lookup
            '_citation_number_key': number,     # 1-based citation number
        }

        # Collect the requested metadata fields for the prompt string,
        # copying each present value into the standardized record too.
        pieces: List[str] = []
        for field in fields:
            value = doc.metadata.get(field)
            if value is None:
                continue
            doc_info[field] = value
            # Human-readable label, e.g. 'meeting_id' -> 'Meeting Id'
            pieces.append(f"{field.replace('_', ' ').title()}: {value}")

        if pieces:
            metadata_line = " | ".join(pieces)
            context_str = f"[{number}] **This is Metadata** \n ({metadata_line})\n **Contextual Text** \n {doc.page_content}"
        else:
            context_str = f"[{number}]\n{doc.page_content}"

        logger.debug(f" Updated Context {number}: {context_str}")
        context_parts.append(context_str)
        processed_results.append(doc_info)

    return "\n---\n".join(context_parts), processed_results
def create_sources_list(
    cited_sources: List[Dict[str, Any]],
    title_metadata_fields: List[str],
    link_metadata_field: str
) -> List[Dict[str, str]]:
    """
    Build the ChatUI sources payload from the cited result dicts.

    Args:
        cited_sources: List of standardized dictionaries that were cited.
        title_metadata_fields: Metadata keys whose values are joined with
            " - " to form each source's title.
        link_metadata_field: The single metadata key holding the source URL.

    Returns:
        A list of {"uri": ..., "title": ...} dicts for ChatUI.
    """
    logger.info("creating sources list for ChatUI")
    logger.debug(f"Raw Cited sources: {cited_sources}")
    sources: List[Dict[str, str]] = []
    for entry in cited_sources:
        # Full original metadata stored by process_context
        metadata = entry.get('__all_metadata__', {})
        number = entry.get('_citation_number', 'N/A')

        # Title: join the configured fields; fall back to "Source N"
        # when none of them are present.
        parts = [str(metadata[field]) for field in title_metadata_fields
                 if metadata.get(field) is not None]
        title = " - ".join(parts) if parts else f"Source {number}"

        # ChatUI requires URLs to match doc://, http://, or https:// schemes,
        # so an empty or '#' placeholder link becomes the doc:// placeholder.
        link = metadata.get(link_metadata_field)
        if not link or link == '#':
            link = 'doc://#'

        sources.append({"uri": link, "title": title})
    logger.debug(f"formatted cited sources :{sources}")
    return sources