Spaces:
Sleeping
Sleeping
File size: 2,192 Bytes
2f4af3f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | import re
from collections import Counter
from itertools import groupby
def is_gibberish(text):
    """Return True when *text* looks like gibberish rather than usable prose.

    The text is lower-cased and split into word tokens (runs of letters,
    digits and underscores). It is reported as gibberish when any of the
    following holds:

    * it is empty, ``None`` or not a ``str``;
    * it contains no word tokens at all;
    * it consists of one token shorter than 3 characters;
    * one token occurs more than 12 times, or is repeated (occurs at least
      twice) and makes up more than 40% of all tokens.

    Args:
        text: Candidate text to inspect.

    Returns:
        bool: True if the text is considered gibberish, False otherwise.
    """
    if not text or not isinstance(text, str):
        return True

    words = re.findall(r"\w+", text.lower())
    if not words:
        return True

    # A lone token that is too short carries no usable content.
    if len(words) == 1 and len(words[0]) < 3:
        return True

    # Excessive repetition. The share test only applies to a token that
    # actually repeats: without that guard every one- or two-word text has a
    # top share of at least 50% and would always be rejected, which also made
    # the single-short-word rule above unreachable.
    _, count = Counter(words).most_common(1)[0]
    if count > 12:
        return True
    return count > 1 and count / len(words) > 0.4
def build_description_from_codes(codes):
    """Turn a sequence of Gardiner codes into a comma-separated description.

    Each code is looked up in ``Config().CODE_TO_LABEL``; a code without an
    entry is used as its own label. Runs of identical consecutive labels are
    collapsed into ``"<label> (x<count>)"``, and a label of ``"?"`` or
    ``None`` is rendered as ``"unknown"``.

    Args:
        codes: Iterable of codes, in reading order.

    Returns:
        str: The labels joined with ``", "`` (empty string for no codes).
    """
    # Imported at call time, as in the original.
    from config import Config

    config = Config()
    label_stream = (config.CODE_TO_LABEL.get(code, code) for code in codes)

    parts = []
    for label, run in groupby(label_stream):
        run_length = sum(1 for _ in run)
        if label == "?" or label is None:
            shown = "unknown"
        else:
            shown = label
        if run_length > 1:
            parts.append(f"{shown} (x{run_length})")
        else:
            parts.append(shown)
    return ", ".join(parts)
def clean_text(text):
    """Normalize whitespace in *text*.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        text: Text to clean; any falsy value yields an empty string.

    Returns:
        str: The normalized text.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""
def extract_words(text, min_length=2):
    """Return the word tokens of *text* that are long enough.

    Args:
        text: Text to tokenize; any falsy value yields an empty list.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        list: Matching tokens in their order of appearance.
    """
    if not text:
        return []

    kept = []
    for token in re.findall(r"\w+", text, flags=re.UNICODE):
        if len(token) >= min_length:
            kept.append(token)
    return kept
def calculate_text_stats(text):
    """Compute simple size statistics for *text*.

    Words are obtained from ``extract_words(text)`` with its default minimum
    length.

    Args:
        text: Text to measure; any falsy value yields all-zero statistics.

    Returns:
        dict: ``char_count`` (length of the text), ``word_count`` (number of
        extracted words), ``unique_chars`` (number of distinct characters)
        and ``avg_word_length`` (mean length of the extracted words, 0 when
        there are none).
    """
    if text:
        tokens = extract_words(text)
        total_length = sum(map(len, tokens))
        return {
            "char_count": len(text),
            "word_count": len(tokens),
            "unique_chars": len(set(text)),
            # max(1, ...) avoids dividing by zero when no word qualifies.
            "avg_word_length": total_length / max(1, len(tokens)),
        }

    return {
        "char_count": 0,
        "word_count": 0,
        "unique_chars": 0,
        "avg_word_length": 0,
    }
|