Spaces:
Runtime error
Runtime error
| """ | |
| Extract evidence, dates, and relevant snippets from content. | |
| """ | |
| import re | |
| from typing import List, Dict | |
| from datetime import datetime | |
def extract_dates(text: str) -> List[Dict[str, str]]:
    """Extract dates from text in several common formats.

    Scans *text* for four date styles — long month name ("January 5, 2020"),
    slash/dash M/D/Y ("1/5/2020"), ISO Y-M-D ("2020-1-5"), and season+year
    ("Spring 2020") — capturing up to 50 characters of surrounding context
    for each hit.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with keys 'date' (normalized display string),
        'context' (surrounding text, stripped), and 'raw' (the exact matched
        substring), de-duplicated by normalized date string in first-seen
        order (month-name matches first, then numeric, ISO, season).
    """
    # One (pattern, flags, formatter) entry per supported date style;
    # the formatter maps match.groups() to the normalized display string.
    patterns = [
        # "January 5, 2020" (comma optional, case-insensitive)
        (r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})\b',
         re.IGNORECASE,
         lambda g: f"{g[0]} {g[1]}, {g[2]}"),
        # "1/5/2020" or "1-5-2020"
        (r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b',
         0,
         lambda g: f"{g[0]}/{g[1]}/{g[2]}"),
        # ISO "2020-1-5" — normalized to M/D/Y so it dedups against the
        # slash style above when both spell the same date.
        (r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b',
         0,
         lambda g: f"{g[1]}/{g[2]}/{g[0]}"),
        # "Spring 2020"
        (r'\b(Spring|Summer|Fall|Winter)\s+(\d{4})\b',
         re.IGNORECASE,
         lambda g: f"{g[0]} {g[1]}"),
    ]
    dates = []
    for pattern, flags, fmt in patterns:
        for match in re.finditer(pattern, text, flags):
            # Grab up to 50 characters of context on either side of the match.
            start = max(0, match.start() - 50)
            end = min(len(text), match.end() + 50)
            dates.append({
                'date': fmt(match.groups()),
                'context': text[start:end].strip(),
                'raw': match.group(),
            })
    # De-duplicate by normalized date string, preserving first-seen order.
    seen = set()
    unique_dates = []
    for d in dates:
        if d['date'] not in seen:
            seen.add(d['date'])
            unique_dates.append(d)
    return unique_dates
def extract_relevant_snippets(content: str, query: str, max_snippets: int = 15, snippet_length: int = 300) -> List[str]:
    """Pick up to *max_snippets* sentences from *content* that best match *query*.

    Sentences are scored by how many query keywords (words longer than three
    characters, matched case-insensitively as substrings) they contain; the
    stable sort keeps original order among ties. Each snippet is truncated to
    *snippet_length* characters, and snippets of 50 characters or fewer are
    skipped. If too few keyword matches exist, remaining slots are filled
    with unused sentences in document order.
    """
    sentences = re.split(r'[.!?]+', content)
    keywords = [word.lower() for word in query.split() if len(word) > 3]

    # Without usable keywords, fall back to the leading sentences.
    if not keywords:
        return [s.strip()[:snippet_length] for s in sentences[:max_snippets] if s.strip()]

    scored = []
    for raw in sentences:
        lowered = raw.lower()
        hits = sum(1 for keyword in keywords if keyword in lowered)
        if hits:
            scored.append((hits, raw.strip()))
    # Stable descending sort by score preserves document order among ties.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    chosen = []
    seen = set()
    for _, sentence in scored[:max_snippets * 2]:
        candidate = sentence[:snippet_length]
        if len(candidate) > 50 and candidate not in seen:
            seen.add(candidate)
            chosen.append(candidate)
            if len(chosen) >= max_snippets:
                break

    # Top up with any remaining long-enough sentences in document order.
    if len(chosen) < max_snippets:
        for raw in sentences:
            candidate = raw.strip()[:snippet_length]
            if len(candidate) > 50 and candidate not in seen:
                seen.add(candidate)
                chosen.append(candidate)
                if len(chosen) >= max_snippets:
                    break
    return chosen
def extract_evidence(pages: List[Dict[str, str]], query: str) -> Dict:
    """Collect query-relevant snippets and dates from a list of fetched pages.

    Args:
        pages: Page dicts expected to carry 'content', 'url', and 'title'
            keys; pages with an empty or missing 'content' are skipped.
        query: Search query used to rank snippets.

    Returns:
        A dict with 'snippets' (each annotated with source URL/title),
        'dates' (likewise annotated), and the original 'pages' list.
    """
    all_snippets = []
    all_dates = []
    for page in pages:
        content = page.get('content', '')
        if not content:
            continue
        # Use tolerant .get() for 'url' too (was page['url'], which raised
        # KeyError on pages without a URL while 'content'/'title' degraded
        # gracefully) — keeps one bad page from aborting the whole batch.
        source_url = page.get('url', '')
        source_title = page.get('title', 'No title')
        for snippet in extract_relevant_snippets(content, query, max_snippets=8, snippet_length=300):
            all_snippets.append({
                'text': snippet,
                'source_url': source_url,
                'source_title': source_title,
            })
        for date_info in extract_dates(content):
            date_info['source_url'] = source_url
            date_info['source_title'] = source_title
            all_dates.append(date_info)
    return {'snippets': all_snippets, 'dates': all_dates, 'pages': pages}