Lega.AI / src /utils /helpers.py
CoderNoah
Initial commit
8b7e8f0
import hashlib
import os
import uuid
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import re
def generate_document_id() -> str:
    """Return a new random identifier for a document.

    Returns:
        The canonical string form of a freshly generated version-4 UUID.
    """
    new_id = uuid.uuid4()
    return str(new_id)
def generate_session_id() -> str:
    """Return a new random identifier for a session.

    Returns:
        The canonical string form of a freshly generated version-4 UUID.
    """
    return f"{uuid.uuid4()}"
def calculate_file_hash(file_content: bytes) -> str:
    """Compute the SHA-256 digest of a file's raw bytes.

    Args:
        file_content: The complete file content as bytes.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    digest.update(file_content)
    return digest.hexdigest()
def sanitize_filename(filename: str) -> str:
    """Sanitize a filename so it is safe to use as a storage name.

    Every character that is not a word character, hyphen, underscore or dot
    is replaced with an underscore. Because the pattern is applied to ``str``,
    ``\\w`` is Unicode-aware and non-ASCII letters and digits are kept.

    Args:
        filename: The raw, possibly user-supplied, file name.

    Returns:
        The sanitized name, at most 255 characters long. The extension is
        preserved when truncating, unless the extension alone is longer than
        the limit.
    """
    max_length = 255

    # Remove or replace dangerous characters
    sanitized = re.sub(r"[^\w\-_\.]", "_", filename)

    # "." and ".." consist only of allowed characters but refer to
    # directories; neutralise them so they cannot be used for traversal.
    if sanitized in (".", ".."):
        return "_" * len(sanitized)

    # Ensure it's not too long
    if len(sanitized) > max_length:
        name, ext = os.path.splitext(sanitized)
        if len(ext) > max_length:
            # The extension alone exceeds the limit: a negative slice bound
            # on ``name`` would leave the result too long, so cut plainly.
            sanitized = sanitized[:max_length]
        else:
            sanitized = name[: max_length - len(ext)] + ext
    return sanitized
def format_file_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached, then rendered with one decimal place.

    Args:
        size_bytes: Size in bytes.

    Returns:
        A string such as ``"1.5 KB"``; exactly ``"0 B"`` for a zero size.
    """
    if size_bytes == 0:
        return "0 B"

    # TB and PB are included so large sizes are not shown as thousands of GB.
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    size = float(size_bytes)  # work on a copy instead of rebinding the argument
    unit_index = 0
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024.0
        unit_index += 1
    return f"{size:.1f} {units[unit_index]}"
def extract_key_dates(text: str) -> List[Dict[str, Any]]:
    """Find date-like substrings in ``text`` together with nearby context.

    Four formats are recognised, matched case-insensitively: ``M/D/YYYY``,
    ``M-D-YYYY``, ``YYYY-M-D`` and a full English month name followed by a
    day and a four-digit year (e.g. ``March 3, 2025``).

    Args:
        text: The text to scan.

    Returns:
        One dict per match with the keys ``"date"`` (the matched text),
        ``"position"`` (start offset of the match in ``text``) and
        ``"context"`` (the match plus up to 50 characters on either side).
        Matches are grouped by format in the order listed above, not sorted
        by position.
    """
    date_patterns = (
        r"\b\d{1,2}/\d{1,2}/\d{4}\b",  # MM/DD/YYYY
        r"\b\d{1,2}-\d{1,2}-\d{4}\b",  # MM-DD-YYYY
        r"\b\d{4}-\d{1,2}-\d{1,2}\b",  # YYYY-MM-DD
        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
    )

    def describe(hit):
        # Build the result record for a single regex match.
        begin, finish = hit.span()
        window_start = max(0, begin - 50)
        return {
            "date": hit.group(0),
            "position": begin,
            "context": text[window_start : finish + 50],
        }

    found = []
    for date_regex in date_patterns:
        found.extend(
            describe(hit) for hit in re.finditer(date_regex, text, re.IGNORECASE)
        )
    return found
def extract_financial_terms(text: str) -> Dict[str, Any]:
    """Extract monetary amounts, percentages and interest rates from text.

    Args:
        text: The text to scan.

    Returns:
        A dict that contains only the keys for which something was found:
        ``"amounts"`` (strings such as ``"₹1,00,000"``, ``"Rs. 2,500.50"``
        or ``"$1,000"``, grouped by currency pattern in that order),
        ``"percentages"`` (strings such as ``"12.5%"``) and
        ``"interest_rates"`` (the first percentage following "interest rate",
        "APR" or "annual percentage rate" on the same line). An empty dict
        is returned when nothing matches.
    """
    financial_info: Dict[str, Any] = {}

    # Digits with optional comma-separated groups and optional two decimals.
    # Requiring a digit after every comma keeps trailing punctuation
    # ("$1,000, payable ...") out of the captured amount.
    number = r"\d+(?:,\d+)*(?:\.\d{2})?"

    # Extract monetary amounts (Indian Rupees and other currencies)
    money_patterns = [
        r"₹" + number,  # Indian Rupees
        # \b stops "Rs" from matching inside another word (e.g. "PRs 20").
        r"\bRs\.?\s*" + number,  # Rs. format
        r"\$" + number,  # USD
    ]
    amounts = []
    for pattern in money_patterns:
        amounts.extend(re.findall(pattern, text))
    if amounts:
        financial_info["amounts"] = amounts

    # Extract percentages
    percentage_pattern = r"\d+(?:\.\d+)?%"
    percentages = re.findall(percentage_pattern, text)
    if percentages:
        financial_info["percentages"] = percentages

    # Extract interest rates
    interest_pattern = (
        r"(?:interest rate|APR|annual percentage rate).*?(\d+(?:\.\d+)?%)"
    )
    interest_matches = re.findall(interest_pattern, text, re.IGNORECASE)
    if interest_matches:
        financial_info["interest_rates"] = interest_matches

    return financial_info
def calculate_risk_score(risk_factors: List[Dict[str, Any]]) -> int:
    """Calculate an overall risk score from individual risk factors.

    Each factor contributes a weight chosen by its ``"severity"`` value
    (case-insensitive, surrounding whitespace ignored): critical=25, high=15,
    medium=8, low=3. A missing or ``None`` severity counts as "low"; any
    other unrecognised value contributes 0.

    Args:
        risk_factors: Dicts that may carry a ``"severity"`` entry.

    Returns:
        The sum of the weights, capped at 100; 0 for an empty input.
    """
    if not risk_factors:
        return 0

    risk_weights = {"critical": 25, "high": 15, "medium": 8, "low": 3}
    total_score = 0
    for factor in risk_factors:
        raw_severity = factor.get("severity")
        # An explicit None would otherwise crash on .lower(); treat it the
        # same as a missing key. str() guards against non-string values.
        if raw_severity is None:
            severity = "low"
        else:
            severity = str(raw_severity).strip().lower()
        total_score += risk_weights.get(severity, 0)

    # Cap at 100
    return min(total_score, 100)
def get_risk_color(risk_score: int) -> str:
    """Map a risk score to a hex color code.

    Args:
        risk_score: The risk score to classify.

    Returns:
        ``"#FF4444"`` (red) for scores of 75 and above, ``"#FF8800"``
        (orange) from 50, ``"#FFCC00"`` (yellow) from 25, and ``"#44AA44"``
        (green) for anything lower.
    """
    # Bands are ordered from highest threshold down; the first one met wins.
    bands = (
        (75, "#FF4444"),  # Red
        (50, "#FF8800"),  # Orange
        (25, "#FFCC00"),  # Yellow
    )
    for floor, color in bands:
        if risk_score >= floor:
            return color
    return "#44AA44"  # Green
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks for processing.

    Each chunk is at most ``chunk_size`` characters long. When a chunk is not
    the last one and contains a period in its second half, it is cut just
    after that period so chunks tend to end on a sentence boundary. The next
    chunk starts ``overlap`` characters before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check such values make the loop run forever or skip text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks: List[str] = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        if end >= text_length:
            # This chunk reaches the end of the text. Stop here: stepping
            # back by `overlap` would only emit a tail that is already
            # contained in this chunk.
            chunks.append(chunk)
            break

        # Try to break at sentence boundary. The break is only taken when it
        # still leaves the next start strictly ahead of the current one
        # (last_period + 1 - overlap >= 1), which guarantees termination.
        last_period = chunk.rfind(".")
        if last_period > chunk_size // 2 and last_period >= overlap:
            chunk = chunk[: last_period + 1]
            end = start + last_period + 1

        chunks.append(chunk)
        start = end - overlap
    return chunks
def format_timestamp(timestamp: datetime) -> str:
    """Format a timestamp as a relative, human-readable age.

    Args:
        timestamp: The moment to describe. A naive value is compared with the
            naive local time; a timezone-aware value is compared with the
            current time in its own timezone.

    Returns:
        ``"N day(s) ago"``, ``"N hour(s) ago"`` or ``"N minute(s) ago"`` using
        the largest unit that fits, or ``"Just now"`` when less than a minute
        has passed or the timestamp lies in the future.
    """
    # Passing the timestamp's tzinfo keeps the subtraction valid for both
    # naive (tzinfo is None) and aware datetimes.
    now = datetime.now(timestamp.tzinfo)

    # total_seconds() is signed, so a future timestamp yields a negative
    # value instead of the misleading positive `.seconds` component.
    elapsed = int((now - timestamp).total_seconds())

    units = ((86400, "day"), (3600, "hour"), (60, "minute"))
    for unit_seconds, unit_name in units:
        if elapsed >= unit_seconds:
            count = elapsed // unit_seconds
            plural = "" if count == 1 else "s"
            return f"{count} {unit_name}{plural} ago"
    return "Just now"