File size: 2,192 Bytes
2f4af3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from collections import Counter
from itertools import groupby

def is_gibberish(text):
    """Return True when *text* looks like gibberish rather than usable text.

    Text is reported as gibberish when any of the following holds:

    * it is not a string, or is empty;
    * it contains no word characters at all;
    * one word occurs more than 12 times;
    * one word is repeated and accounts for more than 40% of all words;
    * it consists of a single word shorter than three characters.

    Words are compared case-insensitively.

    Args:
        text: The value to inspect; anything that is not a ``str`` counts
            as gibberish.

    Returns:
        bool: True if the text is considered gibberish, False otherwise.
    """
    # Type check first so non-string objects are never truth-tested.
    if not isinstance(text, str) or not text:
        return True

    words = re.findall(r"\w+", text.lower())
    if not words:
        return True

    # Check for excessive repetition
    top_count = max(Counter(words).values())
    if top_count > 12:
        return True
    # The share test is only meaningful when a word actually repeats;
    # without the `top_count > 1` guard every one- or two-word text has a
    # share >= 0.5 and would always be flagged, which made the single-word
    # length check below unreachable.
    if top_count > 1 and top_count / len(words) > 0.4:
        return True

    # A lone word must have at least three characters to be meaningful.
    return len(words) == 1 and len(words[0]) < 3

def build_description_from_codes(codes):
    """Turn a sequence of codes into a comma-separated label description.

    Each code is looked up in ``Config().CODE_TO_LABEL``; a code with no
    entry is used as its own label. Consecutive identical labels are
    collapsed into a single ``"label (xN)"`` entry, and a label of ``"?"``
    or ``None`` is shown as ``"unknown"``.

    Args:
        codes: Iterable of codes (Gardiner codes, per the original
            docstring) in reading order.

    Returns:
        str: The labels joined with ``", "``; empty string for no codes.
    """
    from config import Config

    settings = Config()
    resolved = [settings.CODE_TO_LABEL.get(item, item) for item in codes]

    pieces = []
    # groupby only merges *adjacent* equal labels, which is what we want:
    # a label that reappears later starts a new entry.
    for label, run in groupby(resolved):
        run_length = sum(1 for _ in run)

        if label is None or label == "?":
            shown = "unknown"
        else:
            shown = label

        if run_length > 1:
            pieces.append(f"{shown} (x{run_length})")
        else:
            pieces.append(shown)

    return ", ".join(pieces)

def clean_text(text):
    """Normalize whitespace in *text*.

    Every run of whitespace is collapsed to a single space and the result
    is stripped of leading and trailing whitespace.

    Args:
        text: The string to clean; any falsy value yields ``""``.

    Returns:
        str: The cleaned text.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def extract_words(text, min_length=2):
    """Return the words in *text* that are at least *min_length* long.

    A word is a maximal run of word characters (letters, digits,
    underscore), matched Unicode-aware.

    Args:
        text: The string to scan; any falsy value yields an empty list.
        min_length: Minimum number of characters a word needs to be kept.

    Returns:
        list[str]: Matching words in order of appearance.
    """
    if not text:
        return []

    matches = re.finditer(r"\w+", text, flags=re.UNICODE)
    # The span width of a match equals the length of the matched word.
    return [
        found.group()
        for found in matches
        if found.end() - found.start() >= min_length
    ]

def calculate_text_stats(text):
    """Summarize *text* as a dictionary of simple counts.

    Args:
        text: The string to measure; any falsy value yields all zeros.

    Returns:
        dict: With the keys
            ``char_count`` - length of the text,
            ``word_count`` - number of words returned by ``extract_words``
            with its default minimum length,
            ``unique_chars`` - number of distinct characters,
            ``avg_word_length`` - mean length of those words.
    """
    stats = {
        "char_count": 0,
        "word_count": 0,
        "unique_chars": 0,
        "avg_word_length": 0,
    }
    if not text:
        return stats

    tokens = extract_words(text)
    token_total = len(tokens)
    letters = sum(map(len, tokens))

    stats["char_count"] = len(text)
    stats["word_count"] = token_total
    stats["unique_chars"] = len(set(text))
    # Divide by at least 1 so text with no qualifying word averages to 0.0
    # instead of raising ZeroDivisionError.
    stats["avg_word_length"] = letters / max(1, token_total)
    return stats