Ragcore / app /core /metadata.py
NinjainPJs's picture
Initial deploy: RagCore RAG system with hybrid search and Gradio UI
a34068e
import logging
import re
from collections import Counter
from datetime import datetime
from pathlib import Path
from app.models.document import DocumentMetadata
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Regexes used to locate a date-like substring; each one captures the whole
# date string in group 1. They are tried in the order listed here.
DATE_PATTERNS = [
# Four digits, two digits, two digits separated by hyphens (e.g. 2024-03-05).
re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
# Two digits, two digits, four digits separated by slashes (e.g. 03/05/2024).
re.compile(r"\b(\d{2}/\d{2}/\d{4})\b"),
# Full English month name, 1-2 digit day, optional comma, 4-digit year
# (e.g. "March 5, 2024" or "March 5 2024").
re.compile(
r"\b((?:January|February|March|April|May|June|July|August|September|October|November|December)"
r"\s+\d{1,2},?\s+\d{4})\b"
),
]
# strptime formats tried, in order, against a string matched by DATE_PATTERNS.
# Note that slash-separated dates are interpreted as month/day/year.
DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%B %d, %Y", "%B %d %Y"]
def extract_title(text: str) -> str | None:
for line in text.splitlines():
line = line.strip()
if line and len(line) > 3:
return line[:200]
return None
def extract_dates(text: str) -> datetime | None:
    """Return the first parsable date found near the start of *text*.

    Only the first 2000 characters are scanned. The patterns in
    ``DATE_PATTERNS`` are tried in order; for each pattern every match is
    considered in turn, so a date-shaped string that is not a real date
    (e.g. ``2024-13-45``) does not hide a valid date appearing later.

    Args:
        text: Raw document text.

    Returns:
        The parsed ``datetime`` for the first candidate that matches one of
        ``DATE_FORMATS``, or ``None`` when no candidate can be parsed.
    """
    head = text[:2000]  # Only scan beginning; sliced once, not per pattern.
    for pattern in DATE_PATTERNS:
        for match in pattern.finditer(head):
            date_str = match.group(1)
            for fmt in DATE_FORMATS:
                try:
                    return datetime.strptime(date_str, fmt)
                except ValueError:
                    # Wrong format for this candidate; try the next one.
                    continue
    return None
def extract_tags(text: str, max_tags: int = 10) -> list[str]:
    """Derive tags from capitalised words or phrases that repeat in *text*.

    A phrase is one or more whitespace-separated words that each start with
    an uppercase ASCII letter followed by lowercase ASCII letters. Phrases
    seen at least twice are kept, most frequent first, and lowercased.

    Args:
        text: Raw document text.
        max_tags: Upper bound on the number of tags returned.

    Returns:
        Up to ``max_tags`` lowercase tags.
    """
    phrase_counts = Counter(
        re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)
    )
    selected: list[str] = []
    for phrase, occurrences in phrase_counts.most_common(max_tags * 2):
        if occurrences >= 2:
            selected.append(phrase.lower())
    return selected[:max_tags]
def extract_metadata(raw_text: str, filename: str, page_count: int | None = None) -> DocumentMetadata:
    """Assemble a ``DocumentMetadata`` record for one document.

    The document type is the lowercased file extension of *filename*
    without its leading dot, or ``"unknown"`` when there is none. Title,
    creation date and tags are derived from *raw_text* by the sibling
    ``extract_*`` helpers.

    Args:
        raw_text: Full text of the document.
        filename: Name of the source file; stored as ``source``.
        page_count: Number of pages, passed through unchanged.

    Returns:
        The populated ``DocumentMetadata`` instance.
    """
    suffix = Path(filename).suffix.lower().lstrip(".")
    return DocumentMetadata(
        source=filename,
        doc_type=suffix or "unknown",
        title=extract_title(raw_text),
        created_date=extract_dates(raw_text),
        tags=extract_tags(raw_text),
        page_count=page_count,
    )