# My_campus_agent / extract.py
# Author: BenjaminKaindu0506
# Initial commit: UA Student Navigator Chatbot with OpenRouter integration (1be7393)
"""
Extract evidence, dates, and relevant snippets from content.
"""
import re
from typing import List, Dict
from datetime import datetime
def extract_dates(text: str) -> List[Dict[str, str]]:
    """Extract dates from text in several common formats.

    Recognized formats:
      * "January 15, 2025" (full month name, case-insensitive)
      * "03/01/2025" or "03-01-2025" (M/D/YYYY)
      * "2025-03-01" (year-first, normalized to M/D/YYYY)
      * "Fall 2025" (season/semester + year, case-insensitive)

    Args:
        text: Free-form text to scan.

    Returns:
        List of dicts with keys 'date' (normalized date string),
        'context' (up to 50 characters on each side of the match) and
        'raw' (the exact matched text), deduplicated by normalized date
        string, keeping the first occurrence per pattern order.
    """
    # (regex, formatter, flags) — the formatter maps the match groups to
    # the normalized 'date' string. Patterns are applied in this order,
    # matching the original behavior (all month-name dates first, etc.).
    specs = [
        (r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})\b',
         lambda m, d, y: f"{m} {d}, {y}", re.IGNORECASE),
        # NOTE: numeric values are not validated — "99/99/2025" still matches.
        (r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b',
         lambda m, d, y: f"{m}/{d}/{y}", 0),
        # Year-first form, normalized to the same M/D/YYYY shape as above
        # so duplicates across the two patterns collapse in dedup.
        (r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b',
         lambda y, m, d: f"{m}/{d}/{y}", 0),
        (r'\b(Spring|Summer|Fall|Winter)\s+(\d{4})\b',
         lambda season, y: f"{season} {y}", re.IGNORECASE),
    ]
    dates: List[Dict[str, str]] = []
    for pattern, fmt, flags in specs:
        for match in re.finditer(pattern, text, flags):
            # Keep up to 50 characters of surrounding context for display.
            start = max(0, match.start() - 50)
            end = min(len(text), match.end() + 50)
            dates.append({
                'date': fmt(*match.groups()),
                'context': text[start:end].strip(),
                'raw': match.group(),
            })
    # Deduplicate by normalized date string, preserving first occurrence.
    seen = set()
    unique_dates = []
    for d in dates:
        if d['date'] not in seen:
            seen.add(d['date'])
            unique_dates.append(d)
    return unique_dates
def extract_relevant_snippets(content: str, query: str, max_snippets: int = 15, snippet_length: int = 300) -> List[str]:
    """Pull the sentences from *content* that best match the query terms.

    Sentences are scored by how many query keywords (words longer than
    three characters, matched case-insensitively as substrings) they
    contain; higher-scoring sentences come first, ties in document order.
    If the quota is not filled, remaining sufficiently-long sentences are
    appended in document order. Without any usable keywords, the leading
    sentences are returned verbatim (stripped and truncated).
    """
    sentences = re.split(r'[.!?]+', content)
    keywords = [word.lower() for word in query.split() if len(word) > 3]

    if not keywords:
        # No meaningful query terms: fall back to the first sentences.
        return [s.strip()[:snippet_length] for s in sentences[:max_snippets] if s.strip()]

    # Score each sentence by keyword hits; keep only positive scores.
    ranked = []
    for raw in sentences:
        lowered = raw.lower()
        hits = sum(kw in lowered for kw in keywords)
        if hits:
            ranked.append((hits, raw.strip()))
    # Stable sort: equal scores keep their document order.
    ranked.sort(key=lambda item: item[0], reverse=True)

    results: List[str] = []
    picked = set()

    def _take(candidate: str) -> bool:
        # Add the (truncated) candidate if it is long enough and unseen;
        # report whether the quota is now full.
        piece = candidate[:snippet_length]
        if len(piece) > 50 and piece not in picked:
            picked.add(piece)
            results.append(piece)
        return len(results) >= max_snippets

    # Best-scoring sentences first (examine at most twice the quota).
    for _, sentence in ranked[:max_snippets * 2]:
        if _take(sentence):
            break

    # Top up with remaining sentences in document order if still short.
    if len(results) < max_snippets:
        for raw in sentences:
            if _take(raw.strip()):
                break

    return results
def extract_evidence(pages: List[Dict[str, str]], query: str) -> Dict:
    """Collect query-relevant snippets and dates from a set of fetched pages.

    Args:
        pages: Page dicts; each may carry 'content', 'url' and 'title'.
        query: User query used to rank snippets.

    Returns:
        Dict with:
          'snippets': list of {'text', 'source_url', 'source_title'}
          'dates':    date dicts from extract_dates(), tagged with
                      'source_url' / 'source_title'
          'pages':    the input pages, passed through unchanged
    """
    all_snippets = []
    all_dates = []
    for page in pages:
        content = page.get('content', '')
        if not content:
            # Nothing fetched for this page; skip it.
            continue
        # Use .get for 'url' as well as 'title' so a malformed page dict
        # cannot raise KeyError (previously page['url'] could).
        source_url = page.get('url', '')
        source_title = page.get('title', 'No title')
        for snippet in extract_relevant_snippets(content, query, max_snippets=8, snippet_length=300):
            all_snippets.append({
                'text': snippet,
                'source_url': source_url,
                'source_title': source_title,
            })
        for date_info in extract_dates(content):
            # date_info dicts are freshly built by extract_dates, so
            # annotating them in place does not mutate caller data.
            date_info['source_url'] = source_url
            date_info['source_title'] = source_title
            all_dates.append(date_info)
    return {'snippets': all_snippets, 'dates': all_dates, 'pages': pages}