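# NOTE: page residue from the hosting site, kept as a comment so the module
# parses; presumably uploader avatar text, commit message and commit hash:
#   sebasmos's picture / Deploy Sherlock / d6f13c4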
"""
Meeting note parsers for extracting structured data from markdown files.
"""
from pathlib import Path
from typing import List, Optional
from datetime import datetime
from pydantic import BaseModel, Field
import re
class ActionItem(BaseModel):
    """A single to-do entry taken from a meeting note.

    Only ``task`` is required. The other fields default to "nobody assigned,
    no deadline, not completed".
    """

    # Text of the task itself.
    task: str

    # Person responsible for the task, when one is named.
    assignee: Optional[str] = None

    # Due date kept as free text; it is not converted to a date object.
    deadline: Optional[str] = None

    # Whether the item is marked as done.
    completed: bool = False
class MeetingNote(BaseModel):
    """Structured content of one parsed meeting note.

    ``project_name``, ``title`` and ``file_path`` are required. Every list
    field starts out as its own empty list and the optional fields default to
    ``None``.
    """

    # --- identification -------------------------------------------------
    project_name: str
    title: str
    date: Optional[datetime] = None

    # --- content --------------------------------------------------------
    participants: List[str] = Field(default_factory=list)
    # Free-text discussion, or None when the note has none.
    discussion: Optional[str] = None
    decisions: List[str] = Field(default_factory=list)
    action_items: List[ActionItem] = Field(default_factory=list)
    blockers: List[str] = Field(default_factory=list)

    # --- provenance -----------------------------------------------------
    # Location of the source file, stored as a plain string.
    file_path: str
class MeetingParser:
    """Parser for markdown meeting notes.

    All methods are static; the class only groups the parsing helpers.

    Recognised note layout (every part is optional)::

        # Meeting: Sprint Planning
        Date: 2025-01-15
        Participants: Alice, Bob

        ## Discussion
        free text ...

        ## Decisions
        - a decision

        ## Action Items
        - [ ] Alice: Task (by 2025-01-20)

        ## Blockers
        - a blocker
    """

    # Date layouts tried in order; the first one that fits wins, so an
    # ambiguous "01/02/2025" is read as day/month/year.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%B %d, %Y",
        "%b %d, %Y",
        "%Y/%m/%d",
        "%d %B %Y",
        "%d %b %Y",
    )

    # Checkbox token such as "[ ]", "[x]", "[X]", "[✓]" or "[✔]".
    _CHECKBOX_RE = re.compile(r'\[([ xX✓✔])\]')

    # A leading "*" or "+" list bullet (a "-" bullet is stripped separately).
    # The lookahead keeps markdown emphasis such as "**bold**" intact.
    _STAR_BULLET_RE = re.compile(r'^[*+](?=\s|$)')

    # "Name: rest of the task". The name may only contain ASCII letters and
    # whitespace, so any such prefix before the first colon counts as a name.
    _ASSIGNEE_RE = re.compile(r'^([A-Za-z\s]+):\s*(.+)$')

    # Deadline phrases, most specific first. The parenthesised form must be
    # tried before the bare "by ..." forms; otherwise "(by 2025-01-20)" loses
    # only its inner text and leaves "()" behind in the task.
    _DEADLINE_PATTERNS = (
        re.compile(r'\(by\s+([^)]+)\)', re.IGNORECASE),
        re.compile(r'\bby\s+([A-Za-z]+\s+\d{1,2}(?:,\s+\d{4})?)', re.IGNORECASE),
        re.compile(r'\bby\s+(\d{4}-\d{2}-\d{2})', re.IGNORECASE),
    )

    # Section name -> keywords that select it when found in a "## " heading.
    # Checked in order; headings matching nothing are treated as discussion.
    _SECTION_KEYWORDS = (
        ('discussion', ('discussion', 'notes')),
        ('decisions', ('decision',)),
        ('action_items', ('action', 'todo', 'task')),
        ('blockers', ('blocker', 'issue')),
    )

    @staticmethod
    def parse_date(date_str: str) -> Optional[datetime]:
        """Parse a date written in one of the supported layouts.

        Args:
            date_str: Date text; surrounding whitespace is ignored.

        Returns:
            The parsed ``datetime``, or ``None`` when no layout matches.
        """
        text = date_str.strip()
        for fmt in MeetingParser._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return None

    @staticmethod
    def parse_action_item(line: str) -> Optional["ActionItem"]:
        """Parse one action-item line.

        Handles lines such as::

            - [ ] Task
            - [x] Task
            - [ ] Alice: Task by Jan 20
            - [x] Bob: Task (by 2025-01-20)

        The first checkbox on the line decides completion and is the only one
        removed, so bracketed text later in the task (e.g. ``array[x]``) is
        left alone.

        Args:
            line: Raw line from the action-items section.

        Returns:
            The parsed ``ActionItem``, or ``None`` when nothing is left after
            removing the checkbox and list bullet.
        """
        completed = False
        checkbox = MeetingParser._CHECKBOX_RE.search(line)
        if checkbox:
            completed = checkbox.group(1) != ' '
            line = line[:checkbox.start()] + line[checkbox.end():]

        # Drop the list bullet ("-", "*" or "+") that preceded the checkbox.
        line = MeetingParser._STAR_BULLET_RE.sub('', line.strip(), count=1)
        line = line.lstrip('- ').strip()
        if not line:
            return None

        # Try to extract assignee
        assignee = None
        assignee_match = MeetingParser._ASSIGNEE_RE.match(line)
        if assignee_match:
            assignee = assignee_match.group(1).strip()
            line = assignee_match.group(2).strip()

        # Try to extract deadline
        deadline = None
        for pattern in MeetingParser._DEADLINE_PATTERNS:
            found = pattern.search(line)
            if found:
                deadline = found.group(1).strip()
                # Cut out only the matched phrase and close the gap with a
                # single space so no double blank is left mid-sentence.
                before = line[:found.start()].rstrip()
                after = line[found.end():].lstrip()
                line = (before + ' ' + after).strip()
                break

        return ActionItem(
            task=line,
            assignee=assignee,
            deadline=deadline,
            completed=completed
        )

    @staticmethod
    def _section_for(heading: str) -> str:
        """Map a lower-cased ``## `` heading to the section it starts."""
        for section, keywords in MeetingParser._SECTION_KEYWORDS:
            if any(keyword in heading for keyword in keywords):
                return section
        return 'discussion'

    @staticmethod
    def _bullet_text(line: str) -> Optional[str]:
        """Return the text of a ``-``/``*`` bullet line, or None if it has none."""
        if not line.startswith(('-', '*')):
            return None
        # "---" rules and bare bullets strip down to nothing and are skipped.
        return line.lstrip('-*').strip() or None

    @staticmethod
    def parse(file_path: Path, project_name: str) -> Optional["MeetingNote"]:
        """Parse a markdown meeting note file.

        Args:
            file_path: Location of the markdown file.
            project_name: Project the note belongs to; stored unchanged.

        Returns:
            The parsed ``MeetingNote``, or ``None`` when ``file_path`` is not
            an existing regular file.
        """
        if not file_path.is_file():
            return None

        # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM,
        # which would otherwise hide a "# Title" on the first line.
        content = file_path.read_text(encoding='utf-8-sig')

        # Fallback title derived from the file name; a "# " heading replaces it.
        title = file_path.stem.replace('-', ' ').replace('_', ' ').title()
        date = None
        participants = []
        discussion = []
        decisions = []
        action_items = []
        blockers = []
        current_section = None

        for raw_line in content.split('\n'):
            text = raw_line.strip()
            if not text:
                continue
            lowered = text.lower()

            # Title ("# Title" or "# Meeting: Title")
            if text.startswith('# '):
                title = text[2:].strip()
                if title.lower().startswith('meeting:'):
                    title = title[len('meeting:'):].strip()
                continue

            # Metadata lines are recognised anywhere in the file.
            if lowered.startswith('date:'):
                parsed_date = MeetingParser.parse_date(text[len('date:'):])
                # An unparseable "Date:" line must not erase an earlier date.
                if parsed_date is not None:
                    date = parsed_date
                continue

            if lowered.startswith('participants:'):
                names = text[len('participants:'):].split(',')
                # Skip blanks from an empty value or a trailing comma.
                participants = [name.strip() for name in names if name.strip()]
                continue

            # Section heading
            if text.startswith('## '):
                current_section = MeetingParser._section_for(
                    text[3:].strip().lower()
                )
                continue

            # Content before the first "## " heading belongs to no section
            # and is ignored.
            if current_section == 'discussion':
                discussion.append(text)
            elif current_section == 'decisions':
                entry = MeetingParser._bullet_text(text)
                if entry:
                    decisions.append(entry)
            elif current_section == 'action_items':
                # Only lines containing a bracket are treated as action items.
                if '[' in text:
                    action_item = MeetingParser.parse_action_item(text)
                    if action_item is not None:
                        action_items.append(action_item)
            elif current_section == 'blockers':
                entry = MeetingParser._bullet_text(text)
                if entry:
                    blockers.append(entry)

        return MeetingNote(
            project_name=project_name,
            title=title,
            date=date,
            participants=participants,
            discussion='\n'.join(discussion) if discussion else None,
            decisions=decisions,
            action_items=action_items,
            blockers=blockers,
            file_path=str(file_path)
        )
def load_meetings_from_directory(data_dir: Path) -> List["MeetingNote"]:
    """Load all meeting notes from a directory structure.

    Expected structure: ``data_dir/<project_name>/meetings/*.md``. Projects
    and files are visited in sorted path order so the result order does not
    depend on the filesystem.

    Args:
        data_dir: Root directory holding one sub-directory per project.

    Returns:
        Every note that parsed successfully. The list is empty when
        ``data_dir`` is missing or is not a directory.
    """
    meetings = []

    # is_dir() also rejects a regular file, which iterdir() would choke on.
    if not data_dir.is_dir():
        return meetings

    for project_dir in sorted(data_dir.iterdir()):
        if not project_dir.is_dir():
            continue

        meetings_dir = project_dir / "meetings"
        if not meetings_dir.is_dir():
            continue

        for meeting_file in sorted(meetings_dir.glob("*.md")):
            meeting = MeetingParser.parse(meeting_file, project_dir.name)
            if meeting is not None:
                meetings.append(meeting)

    return meetings