File size: 4,948 Bytes
8b7e8f0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | import hashlib
import os
import uuid
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import re
def generate_document_id() -> str:
"""Generate a unique document ID."""
return str(uuid.uuid4())
def generate_session_id() -> str:
"""Generate a unique session ID."""
return str(uuid.uuid4())
def calculate_file_hash(file_content: bytes) -> str:
"""Calculate SHA-256 hash of file content."""
return hashlib.sha256(file_content).hexdigest()
def sanitize_filename(filename: str) -> str:
"""Sanitize filename for safe storage."""
# Remove or replace dangerous characters
sanitized = re.sub(r"[^\w\-_\.]", "_", filename)
# Ensure it's not too long
if len(sanitized) > 255:
name, ext = os.path.splitext(sanitized)
sanitized = name[: 255 - len(ext)] + ext
return sanitized
def format_file_size(size_bytes: int) -> str:
"""Format file size in human readable format."""
if size_bytes == 0:
return "0 B"
size_names = ["B", "KB", "MB", "GB"]
i = 0
while size_bytes >= 1024 and i < len(size_names) - 1:
size_bytes /= 1024.0
i += 1
return f"{size_bytes:.1f} {size_names[i]}"
def extract_key_dates(text: str) -> List[Dict[str, Any]]:
"""Extract dates and deadlines from text."""
date_patterns = [
r"\b\d{1,2}/\d{1,2}/\d{4}\b", # MM/DD/YYYY
r"\b\d{1,2}-\d{1,2}-\d{4}\b", # MM-DD-YYYY
r"\b\d{4}-\d{1,2}-\d{1,2}\b", # YYYY-MM-DD
r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
]
dates = []
for pattern in date_patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
dates.append(
{
"date": match.group(),
"position": match.start(),
"context": text[max(0, match.start() - 50) : match.end() + 50],
}
)
return dates
def extract_financial_terms(text: str) -> Dict[str, Any]:
"""Extract financial information from text."""
financial_info = {}
# Extract monetary amounts (Indian Rupees and other currencies)
money_patterns = [
r"₹[\d,]+(?:\.\d{2})?", # Indian Rupees
r"Rs\.?\s*[\d,]+(?:\.\d{2})?", # Rs. format
r"\$[\d,]+(?:\.\d{2})?", # USD
]
amounts = []
for pattern in money_patterns:
amounts.extend(re.findall(pattern, text))
if amounts:
financial_info["amounts"] = amounts
# Extract percentages
percentage_pattern = r"\d+(?:\.\d+)?%"
percentages = re.findall(percentage_pattern, text)
if percentages:
financial_info["percentages"] = percentages
# Extract interest rates
interest_pattern = (
r"(?:interest rate|APR|annual percentage rate).*?(\d+(?:\.\d+)?%)"
)
interest_matches = re.findall(interest_pattern, text, re.IGNORECASE)
if interest_matches:
financial_info["interest_rates"] = interest_matches
return financial_info
def calculate_risk_score(risk_factors: List[Dict[str, Any]]) -> int:
"""Calculate overall risk score from individual risk factors."""
if not risk_factors:
return 0
risk_weights = {"critical": 25, "high": 15, "medium": 8, "low": 3}
total_score = 0
for factor in risk_factors:
severity = factor.get("severity", "low").lower()
total_score += risk_weights.get(severity, 0)
# Cap at 100
return min(total_score, 100)
def get_risk_color(risk_score: int) -> str:
"""Get color code based on risk score."""
if risk_score >= 75:
return "#FF4444" # Red
elif risk_score >= 50:
return "#FF8800" # Orange
elif risk_score >= 25:
return "#FFCC00" # Yellow
else:
return "#44AA44" # Green
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
"""Split text into overlapping chunks for processing."""
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
chunk = text[start:end]
# Try to break at sentence boundary
if end < len(text):
last_period = chunk.rfind(".")
if last_period > chunk_size // 2:
chunk = chunk[: last_period + 1]
end = start + last_period + 1
chunks.append(chunk)
start = end - overlap
return chunks
def format_timestamp(timestamp: datetime) -> str:
"""Format timestamp for display."""
now = datetime.now()
diff = now - timestamp
if diff.days > 0:
return f"{diff.days} days ago"
elif diff.seconds > 3600:
hours = diff.seconds // 3600
return f"{hours} hours ago"
elif diff.seconds > 60:
minutes = diff.seconds // 60
return f"{minutes} minutes ago"
else:
return "Just now"
|