File size: 8,539 Bytes
b309c22
 
 
 
 
770544d
b309c22
 
 
 
770544d
b309c22
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
 
 
 
 
770544d
b309c22
 
 
770544d
b309c22
 
 
770544d
b309c22
 
770544d
 
b309c22
 
770544d
b309c22
 
770544d
b309c22
 
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
770544d
b309c22
770544d
 
b309c22
770544d
 
 
b309c22
770544d
 
b309c22
 
770544d
b309c22
 
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
 
 
770544d
 
 
 
 
 
 
 
b309c22
770544d
b309c22
770544d
b309c22
770544d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b309c22
770544d
 
 
 
 
 
 
 
 
 
 
 
 
b309c22
770544d
 
 
 
b309c22
770544d
 
 
b309c22
 
770544d
b309c22
770544d
b309c22
770544d
 
b309c22
770544d
 
 
 
 
 
 
 
 
 
b309c22
 
 
 
 
 
770544d
b309c22
 
 
770544d
b309c22
 
 
 
 
770544d
b309c22
770544d
 
b309c22
770544d
 
 
 
 
b309c22
 
 
 
 
 
770544d
b309c22
 
 
 
770544d
b309c22
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
770544d
 
b309c22
 
 
 
 
 
770544d
 
 
b309c22
 
770544d
b309c22
 
 
770544d
b309c22
 
 
 
770544d
b309c22
 
 
770544d
b309c22
 
770544d
 
 
 
 
 
 
 
 
 
 
b309c22
 
 
 
 
 
770544d
b309c22
 
 
770544d
b309c22
 
 
 
 
770544d
b309c22
 
 
770544d
b309c22
 
 
770544d
b309c22
 
 
 
 
 
770544d
b309c22
 
770544d
b309c22
 
 
 
770544d
 
b309c22
770544d
 
b309c22
 
770544d
b309c22
770544d
 
b309c22
770544d
 
b309c22
770544d
 
 
 
b309c22
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""
Utility functions for Smart Auto-Complete
Provides common functionality for text processing, logging, and validation
"""

import html
import logging
import re
import sys
import unicodedata
from typing import Dict, List, Optional, Tuple


def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure and return the application's console logger

    Handlers already attached to the "smart_autocomplete" logger are
    detached first, so calling this repeatedly leaves a single handler.

    Args:
        level: Name of a logging level (DEBUG, INFO, WARNING, ERROR,
            CRITICAL); it is upper-cased before being looked up on the
            ``logging`` module

    Returns:
        The "smart_autocomplete" logger with one stdout handler attached
    """
    # Resolve the level name once; an unknown name raises AttributeError
    numeric_level = getattr(logging, level.upper())

    app_logger = logging.getLogger("smart_autocomplete")
    app_logger.setLevel(numeric_level)

    # Start from a clean slate so repeated calls never duplicate output
    for stale_handler in list(app_logger.handlers):
        app_logger.removeHandler(stale_handler)

    # Single console handler writing timestamped records to stdout
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(numeric_level)
    stdout_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    app_logger.addHandler(stdout_handler)

    return app_logger


def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing

    Steps, in order: convert to ``str``, apply NFKC normalization, drop
    control characters (code points below 32 other than newline and tab),
    HTML-escape, collapse runs of blank lines and of spaces/tabs, and
    strip leading/trailing whitespace.

    Args:
        text: Raw input text; any falsy value yields an empty string

    Returns:
        Cleaned and sanitized text
    """
    if not text:
        return ""

    # Convert to string if not already
    text = str(text)

    # Normalize BEFORE escaping: NFKC folds compatibility characters such
    # as fullwidth '<' (U+FF1C) into their ASCII forms, so escaping first
    # would let them reappear unescaped afterwards.
    text = unicodedata.normalize("NFKC", text)

    # Drop control characters (keeping newlines and tabs) before whitespace
    # is collapsed, so a removed character cannot leave a doubled space or
    # an uncollapsed run of newlines behind.
    text = "".join(char for char in text if ord(char) >= 32 or char in "\n\t")

    # HTML escape to prevent injection
    text = html.escape(text)

    # Remove excessive whitespace but preserve structure
    text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)  # Max 2 consecutive newlines
    text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs to single space

    # Trim leading/trailing whitespace
    return text.strip()


def _count_whole_phrases(text_lower: str, phrases: List[str]) -> int:
    """Count how many of *phrases* occur in *text_lower* as whole words."""
    return sum(
        1
        for phrase in phrases
        if re.search(r"\b" + re.escape(phrase) + r"\b", text_lower)
    )


def extract_context_hints(text: str) -> Dict[str, object]:
    """
    Extract contextual hints from the input text to improve suggestions

    Greeting, signature, question, tone and creative keywords are matched
    case-insensitively as whole words, so "hi" is not found inside "this"
    and "hey" is not found inside "they". Code markers are matched as plain
    substrings because several of them are punctuation or carry a trailing
    space.

    Args:
        text: Input text to analyze

    Returns:
        Dictionary containing context hints with the keys ``length``,
        ``word_count``, ``has_greeting``, ``has_signature``,
        ``has_code_markers``, ``has_questions``, ``tone`` ("neutral",
        "formal" or "casual") and ``language_style`` ("linkedin",
        "technical", "business" or "creative")
    """
    hints: Dict[str, object] = {
        "length": len(text),
        "word_count": len(text.split()),
        "has_greeting": False,
        "has_signature": False,
        "has_code_markers": False,
        "has_questions": False,
        "tone": "neutral",
        "language_style": "linkedin",
    }

    text_lower = text.lower()

    # Check for email patterns
    email_greetings = [
        "dear",
        "hello",
        "hi",
        "greetings",
        "good morning",
        "good afternoon",
    ]
    email_signatures = [
        "sincerely",
        "best regards",
        "thank you",
        "yours truly",
        "kind regards",
    ]

    hints["has_greeting"] = _count_whole_phrases(text_lower, email_greetings) > 0
    hints["has_signature"] = _count_whole_phrases(text_lower, email_signatures) > 0

    # Check for code patterns (substring match: markers include punctuation
    # and keywords with a trailing space)
    code_markers = [
        "//",
        "/*",
        "*/",
        "#",
        "def ",
        "function",
        "class ",
        "import ",
        "from ",
    ]
    hints["has_code_markers"] = any(marker in text_lower for marker in code_markers)

    # Check for questions: an explicit question mark or an interrogative word
    question_words = ["what", "how", "why", "when", "where", "who"]
    hints["has_questions"] = (
        "?" in text or _count_whole_phrases(text_lower, question_words) > 0
    )

    # Determine tone by comparing how many distinct formal vs casual words occur
    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]

    formal_count = _count_whole_phrases(text_lower, formal_words)
    casual_count = _count_whole_phrases(text_lower, casual_words)

    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Determine language style; code markers take priority over email
    # patterns, which take priority over creative-writing keywords
    creative_words = ["once upon", "story", "character", "plot"]
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif _count_whole_phrases(text_lower, creative_words) > 0:
        hints["language_style"] = "creative"

    return hints


def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    This checks only the key's shape (prefix and minimum length); it does
    not contact the provider.

    Args:
        api_key: The API key to validate; surrounding whitespace is ignored
        provider: The provider name (openai, anthropic); case and
            surrounding whitespace are ignored

    Returns:
        True if the key format is valid, False otherwise (including when
        either argument is not a string or the provider is unknown)
    """
    if not isinstance(api_key, str) or not isinstance(provider, str):
        return False

    api_key = api_key.strip()
    provider = provider.strip().lower()

    # Both supported providers share the same minimum length requirement
    min_key_length = 40
    if len(api_key) < min_key_length:
        return False

    if provider == "anthropic":
        # Anthropic keys start with 'sk-ant-'
        return api_key.startswith("sk-ant-")

    if provider == "openai":
        # OpenAI keys start with 'sk-'; the Anthropic prefix also starts
        # with 'sk-', so it is excluded explicitly to avoid accepting a key
        # for the wrong provider.
        return api_key.startswith("sk-") and not api_key.startswith("sk-ant-")

    return False


def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    Text that already fits is returned unchanged. Otherwise the text is cut
    and "..." is appended, with the ellipsis counted against ``max_length``
    so the result never exceeds it.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length of the returned string
        preserve_words: Whether to preserve word boundaries

    Returns:
        Truncated text, at most ``max_length`` characters long (empty when
        ``max_length`` is zero or negative)
    """
    ellipsis = "..."

    if max_length <= 0:
        return ""

    if len(text) <= max_length:
        return text

    # Not enough room for any content plus the ellipsis: hard cut instead
    if max_length <= len(ellipsis):
        return text[:max_length]

    # Reserve space for the ellipsis so the final result fits max_length
    budget = max_length - len(ellipsis)
    kept = text[:budget]

    # If the character right after the cut is whitespace, the cut already
    # falls on a word boundary and there is nothing to back up over.
    if preserve_words and not text[budget].isspace():
        last_space = kept.rfind(" ")
        # Only use word boundary if it's not too far back
        if last_space > budget * 0.8:
            kept = kept[:last_space]

    return kept.rstrip() + ellipsis


def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, str]]:
    """
    Build UI-ready records for a list of suggestions

    Each suggestion is passed through ``sanitize_input`` and a shortened
    copy is produced with ``truncate_text``.

    Args:
        suggestions: List of suggestion strings
        max_display_length: Length limit passed to ``truncate_text`` for
            the display copy

    Returns:
        One dictionary per suggestion, in input order, with the keys
        ``id`` (1-based position), ``text``, ``display_text``, ``length``
        and ``word_count``
    """

    def _as_entry(position, raw_suggestion):
        # Sanitize once and derive every field from the cleaned text
        cleaned = sanitize_input(raw_suggestion)
        return {
            "id": position,
            "text": cleaned,
            "display_text": truncate_text(cleaned, max_display_length),
            "length": len(cleaned),
            "word_count": len(cleaned.split()),
        }

    return [
        _as_entry(position, raw_suggestion)
        for position, raw_suggestion in enumerate(suggestions, start=1)
    ]


def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Score how similar two texts are by their shared vocabulary

    The score is the Jaccard index of the two sets of lower-cased,
    whitespace-separated words: shared words divided by all distinct words.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1; 0.0 when either text is empty or
        neither contains any words
    """
    if not (text1 and text2):
        return 0.0

    # Case-insensitive vocabularies of each text
    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    distinct_total = len(vocab_a | vocab_b)
    if distinct_total <= 0:
        # Both texts were whitespace only
        return 0.0

    shared_total = len(vocab_a & vocab_b)
    return shared_total / distinct_total


def get_text_stats(text: str) -> Dict[str, int]:
    """
    Compute simple size statistics for a piece of text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with the keys ``characters`` (everything except spaces,
        newlines and tabs), ``words`` (whitespace-separated tokens),
        ``sentences`` (runs of '.', '!' or '?') and ``paragraphs``
        (non-blank chunks separated by a blank line). Empty input yields
        all zeros; otherwise sentences and paragraphs are at least 1.
    """
    stats = {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}
    if not text:
        return stats

    # Everything that is not a space, newline or tab counts as a character
    stats["characters"] = sum(1 for ch in text if ch not in " \n\t")

    stats["words"] = len(text.split())

    # Rough sentence estimate: each run of terminators ends one sentence;
    # non-empty text is reported as at least one sentence
    terminator_runs = re.findall(r"[.!?]+", text)
    stats["sentences"] = len(terminator_runs) or 1

    # Paragraphs are separated by a blank line; whitespace-only chunks are ignored
    non_blank_chunks = sum(1 for chunk in text.split("\n\n") if chunk.strip())
    stats["paragraphs"] = non_blank_chunks or 1

    return stats