# File size: 4,948 Bytes
# 8b7e8f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import hashlib
import os
import uuid
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import re


def generate_document_id() -> str:
    """Return a new random (version 4) UUID rendered as a string."""
    document_uuid = uuid.uuid4()
    return str(document_uuid)


def generate_session_id() -> str:
    """Return a new random (version 4) UUID string to identify a session."""
    return f"{uuid.uuid4()}"


def calculate_file_hash(file_content: bytes) -> str:
    """Return the SHA-256 digest of ``file_content`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(file_content)
    return hasher.hexdigest()


def sanitize_filename(filename: str) -> str:
    """Sanitize a filename for safe storage.

    Every character that is not a word character (``\\w``), hyphen,
    underscore or dot is replaced with an underscore, and the result is
    limited to 255 characters. When truncation is needed the extension
    (as split by ``os.path.splitext``) is preserved and the stem is
    shortened, unless the extension alone is longer than the limit.

    Args:
        filename: The raw filename to clean up.

    Returns:
        The sanitized filename, never longer than 255 characters.
    """
    max_length = 255

    # Replace anything outside the allowed character set.
    sanitized = re.sub(r"[^\w\-_\.]", "_", filename)
    if len(sanitized) <= max_length:
        return sanitized

    name, ext = os.path.splitext(sanitized)
    if len(ext) > max_length:
        # The extension alone blows the budget. Slicing the stem with a
        # negative bound would still leave the result over the limit, so
        # fall back to a plain hard truncation.
        return sanitized[:max_length]

    # Shorten the stem so that stem + extension fits exactly.
    return name[: max_length - len(ext)] + ext


def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit (GB) is reached, then shown with one decimal place. Zero is
    special-cased as ``"0 B"``.
    """
    if size_bytes == 0:
        return "0 B"

    units = ("B", "KB", "MB", "GB")
    value = size_bytes

    # Walk every unit except the last; the last one absorbs whatever is left.
    for unit in units[:-1]:
        if value < 1024:
            return f"{value:.1f} {unit}"
        value = value / 1024.0

    return f"{value:.1f} {units[-1]}"


def extract_key_dates(text: str) -> List[Dict[str, Any]]:
    """Extract dates and deadlines from text."""
    date_patterns = [
        r"\b\d{1,2}/\d{1,2}/\d{4}\b",  # MM/DD/YYYY
        r"\b\d{1,2}-\d{1,2}-\d{4}\b",  # MM-DD-YYYY
        r"\b\d{4}-\d{1,2}-\d{1,2}\b",  # YYYY-MM-DD
        r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
    ]

    dates = []
    for pattern in date_patterns:
        matches = re.finditer(pattern, text, re.IGNORECASE)
        for match in matches:
            dates.append(
                {
                    "date": match.group(),
                    "position": match.start(),
                    "context": text[max(0, match.start() - 50) : match.end() + 50],
                }
            )

    return dates


def extract_financial_terms(text: str) -> Dict[str, Any]:
    """Collect monetary amounts, percentages and interest rates from ``text``.

    Returns:
        A dict that contains only the categories with at least one match:
        ``"amounts"`` (rupee, ``Rs`` and dollar figures, grouped by currency
        pattern in that order), ``"percentages"`` (every ``N%`` figure) and
        ``"interest_rates"`` (the first percentage following an
        "interest rate" / "APR" / "annual percentage rate" mention on the
        same line, matched case-insensitively).
    """
    currency_regexes = (
        r"₹[\d,]+(?:\.\d{2})?",  # rupee sign
        r"Rs\.?\s*[\d,]+(?:\.\d{2})?",  # "Rs" / "Rs." prefix
        r"\$[\d,]+(?:\.\d{2})?",  # dollar sign
    )
    found_amounts = [
        hit for regex in currency_regexes for hit in re.findall(regex, text)
    ]

    found_percentages = re.findall(r"\d+(?:\.\d+)?%", text)

    found_rates = re.findall(
        r"(?:interest rate|APR|annual percentage rate).*?(\d+(?:\.\d+)?%)",
        text,
        re.IGNORECASE,
    )

    # Keep only the categories that actually produced matches.
    categories = (
        ("amounts", found_amounts),
        ("percentages", found_percentages),
        ("interest_rates", found_rates),
    )
    return {label: values for label, values in categories if values}


def calculate_risk_score(risk_factors: List[Dict[str, Any]]) -> int:
    """Calculate an overall risk score from individual risk factors.

    Each factor contributes a weight based on its ``"severity"`` entry
    (critical=25, high=15, medium=8, low=3). The lookup ignores case and
    surrounding whitespace. A missing or ``None`` severity counts as
    ``"low"``; any unrecognised value contributes nothing.

    Args:
        risk_factors: Dicts that may carry a ``"severity"`` key.

    Returns:
        The summed weights, capped at 100 (0 for an empty input).
    """
    if not risk_factors:
        return 0

    risk_weights = {"critical": 25, "high": 15, "medium": 8, "low": 3}

    total_score = 0
    for factor in risk_factors:
        severity = factor.get("severity")
        if severity is None:
            # An explicit None is treated like a missing key instead of
            # crashing on ``None.lower()``.
            severity = "low"
        # str() keeps non-string severities from raising; they simply fall
        # through to the "unrecognised" weight of 0.
        normalized = str(severity).strip().lower()
        total_score += risk_weights.get(normalized, 0)

    # Cap at 100
    return min(total_score, 100)


def get_risk_color(risk_score: int) -> str:
    """Map a risk score to a hex colour code.

    Scores of 75 and above are red, 50 and above orange, 25 and above
    yellow, and everything lower is green.
    """
    # Ordered from the highest threshold down; the first band reached wins.
    color_bands = (
        (75, "#FF4444"),  # Red
        (50, "#FF8800"),  # Orange
        (25, "#FFCC00"),  # Yellow
    )
    for minimum_score, hex_color in color_bands:
        if risk_score >= minimum_score:
            return hex_color
    return "#44AA44"  # Green


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks for processing.

    Chunks are at most ``chunk_size`` characters long. A chunk that is not
    the last one is cut just after its final ``"."`` when that period lies
    in the second half of the chunk. Each following chunk starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Such
            values would otherwise keep the loop from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_length = len(text)
    start = 0

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at sentence boundary
        if end < text_length:
            last_period = chunk.rfind(".")
            # Only accept the cut if the shortened chunk is still longer
            # than the overlap; otherwise the next start would not move
            # forward.
            if last_period > chunk_size // 2 and last_period >= overlap:
                chunk = chunk[: last_period + 1]
                end = start + last_period + 1

        chunks.append(chunk)

        # This chunk already reaches the end of the text. Stepping back by
        # ``overlap`` would only emit a tail that is fully contained in it.
        if end >= text_length:
            break
        start = end - overlap

    return chunks


def format_timestamp(timestamp: datetime) -> str:
    """Format a timestamp as a relative "time ago" string for display.

    The elapsed time between ``timestamp`` and the current time is shown
    in the largest fitting unit among days, hours and minutes, with the
    unit pluralised to match the count. Anything under one minute,
    including timestamps in the future, is reported as ``"Just now"``.

    Args:
        timestamp: The moment to describe. Both naive and timezone-aware
            values are accepted; the current time is taken with the same
            ``tzinfo`` so the subtraction is always valid.

    Returns:
        A string such as ``"3 days ago"``, ``"1 hour ago"``,
        ``"12 minutes ago"`` or ``"Just now"``.
    """
    # Matching the timestamp's tzinfo avoids the TypeError raised when a
    # naive and an aware datetime are subtracted.
    now = datetime.now(timestamp.tzinfo)

    # total_seconds() is negative for future timestamps. timedelta.days and
    # timedelta.seconds are not: a moment one second ahead normalises to
    # days=-1, seconds=86399, which would read as "23 hours ago".
    elapsed_seconds = int((now - timestamp).total_seconds())

    if elapsed_seconds >= 86400:
        count, unit = elapsed_seconds // 86400, "day"
    elif elapsed_seconds >= 3600:
        count, unit = elapsed_seconds // 3600, "hour"
    elif elapsed_seconds >= 60:
        count, unit = elapsed_seconds // 60, "minute"
    else:
        return "Just now"

    plural_suffix = "" if count == 1 else "s"
    return f"{count} {unit}{plural_suffix} ago"