Spaces:
Sleeping
Sleeping
Update code
Browse files- stylometry.py +20 -16
stylometry.py
CHANGED
|
@@ -76,7 +76,7 @@ LEET_PATTERN = re.compile(r'\b\w*\d+\w*\b')
|
|
| 76 |
class AdvancedStyleFeatures:
|
| 77 |
"""Enhanced features extracted from a user's messages."""
|
| 78 |
|
| 79 |
-
def __init__(self, user_id:
|
| 80 |
self.user_id = user_id
|
| 81 |
self.user_name = user_name
|
| 82 |
self.message_count = 0
|
|
@@ -198,51 +198,55 @@ class AdvancedStylometryAnalyzer:
|
|
| 198 |
self._embedding_model = False # Mark as failed
|
| 199 |
return self._embedding_model if self._embedding_model else None
|
| 200 |
|
| 201 |
-
def get_active_users(self, min_messages: int = 300, days: int = 365) -> List[Tuple[
|
| 202 |
"""Get users active in the last N days with at least min_messages."""
|
| 203 |
cutoff_date = datetime.now() - timedelta(days=days)
|
| 204 |
-
|
| 205 |
|
| 206 |
conn = sqlite3.connect(self.db_path)
|
| 207 |
cursor = conn.cursor()
|
| 208 |
|
|
|
|
| 209 |
query = """
|
| 210 |
-
SELECT
|
| 211 |
-
FROM
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
HAVING msg_count >= ?
|
| 216 |
ORDER BY msg_count DESC
|
| 217 |
"""
|
| 218 |
|
| 219 |
-
cursor.execute(query, (
|
| 220 |
users = cursor.fetchall()
|
| 221 |
conn.close()
|
| 222 |
|
| 223 |
return users
|
| 224 |
|
| 225 |
-
def get_user_messages(self, user_id:
|
| 226 |
"""Get messages for a user (text, date)."""
|
| 227 |
cutoff_date = datetime.now() - timedelta(days=days)
|
| 228 |
-
|
| 229 |
|
| 230 |
conn = sqlite3.connect(self.db_path)
|
| 231 |
cursor = conn.cursor()
|
| 232 |
|
| 233 |
query = """
|
| 234 |
-
SELECT
|
| 235 |
-
WHERE
|
| 236 |
-
|
|
|
|
| 237 |
"""
|
| 238 |
|
| 239 |
-
cursor.execute(query, (user_id,
|
| 240 |
messages = cursor.fetchall()
|
| 241 |
conn.close()
|
| 242 |
|
| 243 |
return messages
|
| 244 |
|
| 245 |
-
def extract_features(self, user_id:
|
| 246 |
messages: List[Tuple[str, str]]) -> AdvancedStyleFeatures:
|
| 247 |
"""Extract comprehensive stylometric features from user messages."""
|
| 248 |
features = AdvancedStyleFeatures(user_id, user_name)
|
|
|
|
| 76 |
class AdvancedStyleFeatures:
|
| 77 |
"""Enhanced features extracted from a user's messages."""
|
| 78 |
|
| 79 |
+
def __init__(self, user_id: str, user_name: str):
|
| 80 |
self.user_id = user_id
|
| 81 |
self.user_name = user_name
|
| 82 |
self.message_count = 0
|
|
|
|
| 198 |
self._embedding_model = False # Mark as failed
|
| 199 |
return self._embedding_model if self._embedding_model else None
|
| 200 |
|
| 201 |
+
def get_active_users(self, min_messages: int = 300, days: int = 365) -> List[Tuple[str, str, int]]:
|
| 202 |
"""Get users active in the last N days with at least min_messages."""
|
| 203 |
cutoff_date = datetime.now() - timedelta(days=days)
|
| 204 |
+
cutoff_timestamp = int(cutoff_date.timestamp())
|
| 205 |
|
| 206 |
conn = sqlite3.connect(self.db_path)
|
| 207 |
cursor = conn.cursor()
|
| 208 |
|
| 209 |
+
# Use from_id and from_name directly from messages table
|
| 210 |
query = """
|
| 211 |
+
SELECT from_id, MAX(from_name) as name, COUNT(*) as msg_count
|
| 212 |
+
FROM messages
|
| 213 |
+
WHERE date_unixtime >= ?
|
| 214 |
+
AND from_id IS NOT NULL
|
| 215 |
+
AND text_plain IS NOT NULL
|
| 216 |
+
AND text_plain != ''
|
| 217 |
+
GROUP BY from_id
|
| 218 |
HAVING msg_count >= ?
|
| 219 |
ORDER BY msg_count DESC
|
| 220 |
"""
|
| 221 |
|
| 222 |
+
cursor.execute(query, (cutoff_timestamp, min_messages))
|
| 223 |
users = cursor.fetchall()
|
| 224 |
conn.close()
|
| 225 |
|
| 226 |
return users
|
| 227 |
|
| 228 |
+
def get_user_messages(self, user_id: str, days: int = 365) -> List[Tuple[str, str]]:
|
| 229 |
"""Get messages for a user (text, date)."""
|
| 230 |
cutoff_date = datetime.now() - timedelta(days=days)
|
| 231 |
+
cutoff_timestamp = int(cutoff_date.timestamp())
|
| 232 |
|
| 233 |
conn = sqlite3.connect(self.db_path)
|
| 234 |
cursor = conn.cursor()
|
| 235 |
|
| 236 |
query = """
|
| 237 |
+
SELECT text_plain, date FROM messages
|
| 238 |
+
WHERE from_id = ? AND date_unixtime >= ?
|
| 239 |
+
AND text_plain IS NOT NULL AND text_plain != ''
|
| 240 |
+
ORDER BY date_unixtime
|
| 241 |
"""
|
| 242 |
|
| 243 |
+
cursor.execute(query, (user_id, cutoff_timestamp))
|
| 244 |
messages = cursor.fetchall()
|
| 245 |
conn.close()
|
| 246 |
|
| 247 |
return messages
|
| 248 |
|
| 249 |
+
def extract_features(self, user_id: str, user_name: str,
|
| 250 |
messages: List[Tuple[str, str]]) -> AdvancedStyleFeatures:
|
| 251 |
"""Extract comprehensive stylometric features from user messages."""
|
| 252 |
features = AdvancedStyleFeatures(user_id, user_name)
|