Spaces:
Sleeping
Sleeping
| import logging | |
| import re | |
| from collections import Counter | |
| from datetime import datetime | |
| from pathlib import Path | |
| from app.models.document import DocumentMetadata | |
logger = logging.getLogger(__name__)

# Numeric year-month-day, e.g. "2024-03-15".
_ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")

# Two-digit/two-digit/four-digit with slashes, e.g. "03/15/2024".
_SLASH_DATE_RE = re.compile(r"\b(\d{2}/\d{2}/\d{4})\b")

# Full English month name, day, optional comma, year, e.g. "March 15, 2024".
_LONG_DATE_RE = re.compile(
    r"\b((?:January|February|March|April|May|June|July|August|September|October|November|December)"
    r"\s+\d{1,2},?\s+\d{4})\b"
)

# Regexes used to locate date-like substrings; group 1 is the candidate text.
DATE_PATTERNS = [_ISO_DATE_RE, _SLASH_DATE_RE, _LONG_DATE_RE]

# strptime formats tried, in order, against each candidate string.
DATE_FORMATS = [
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%B %d, %Y",
    "%B %d %Y",
]
| def extract_title(text: str) -> str | None: | |
| for line in text.splitlines(): | |
| line = line.strip() | |
| if line and len(line) > 3: | |
| return line[:200] | |
| return None | |
def extract_dates(text: str) -> datetime | None:
    """Return the first parseable date found near the start of *text*.

    Only the first 2000 characters are scanned. The regexes in
    ``DATE_PATTERNS`` are tried in list order. For each regex, every match
    (in document order) is tried against each format in ``DATE_FORMATS``.

    A match that merely looks like a date but cannot be parsed (for example
    ``2024-99-99``) is skipped, so a later valid date of the same shape is
    still found instead of being hidden by the invalid first hit.

    Returns:
        The ``datetime`` produced by ``datetime.strptime`` for the first
        candidate that parses, or ``None`` if no candidate parses.
    """
    # Slice once; the scanned window is the same for every pattern.
    head = text[:2000]
    for pattern in DATE_PATTERNS:
        # finditer rather than search: keep looking past unparseable matches.
        for match in pattern.finditer(head):
            date_str = match.group(1)
            for fmt in DATE_FORMATS:
                try:
                    return datetime.strptime(date_str, fmt)
                except ValueError:
                    # Wrong format for this candidate; try the next one.
                    continue
    return None
def extract_tags(text: str, max_tags: int = 10) -> list[str]:
    """Derive lowercase tags from repeated capitalized phrases in *text*.

    A phrase is one or more whitespace-separated words, each an ASCII
    uppercase letter followed by ASCII lowercase letters. Phrases are ranked
    by frequency; only those seen at least twice are kept, and at most
    ``max_tags`` are returned.
    """
    phrases = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)
    # Look at twice as many ranked entries as requested before filtering.
    ranked = Counter(phrases).most_common(max_tags * 2)
    repeated = [phrase.lower() for phrase, seen in ranked if seen >= 2]
    return repeated[:max_tags]
def extract_metadata(raw_text: str, filename: str, page_count: int | None = None) -> DocumentMetadata:
    """Build a ``DocumentMetadata`` record for one document.

    The document type is the lowercased file extension of *filename* without
    its leading dot, or ``"unknown"`` when there is no extension. Title,
    creation date and tags are derived from *raw_text* by ``extract_title``,
    ``extract_dates`` and ``extract_tags``; *filename* and *page_count* are
    passed through as ``source`` and ``page_count``.
    """
    extension = Path(filename).suffix.lower().lstrip(".")

    title = extract_title(raw_text)
    created = extract_dates(raw_text)
    tags = extract_tags(raw_text)

    return DocumentMetadata(
        source=filename,
        doc_type=extension or "unknown",
        title=title,
        created_date=created,
        tags=tags,
        page_count=page_count,
    )