Spaces:
Sleeping
Sleeping
Commit: Update tweet_analyzer.py
Browse files — tweet_analyzer.py (+15 lines, −13 lines)
tweet_analyzer.py
CHANGED
|
@@ -21,6 +21,19 @@ class TweetDatasetProcessor:
|
|
| 21 |
self.vectorizer = TfidfVectorizer(stop_words='english')
|
| 22 |
self.used_tweets = set() # Track used tweets to avoid repetition
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def extract_text_from_pdf(self, pdf_path):
|
| 25 |
"""Extract text content from PDF file."""
|
| 26 |
reader = PdfReader(pdf_path)
|
|
@@ -35,7 +48,8 @@ class TweetDatasetProcessor:
|
|
| 35 |
raise ValueError("The uploaded PDF appears to be empty.")
|
| 36 |
|
| 37 |
lines = text.split('\n')
|
| 38 |
-
|
|
|
|
| 39 |
self.tweets = [tweet for tweet in clean_tweets if tweet]
|
| 40 |
|
| 41 |
if not self.tweets:
|
|
@@ -46,18 +60,6 @@ class TweetDatasetProcessor:
|
|
| 46 |
df.to_csv('processed_tweets.csv', index=False)
|
| 47 |
return df
|
| 48 |
|
| 49 |
-
def _process_line(self, line):
    """Turn one raw line of extracted PDF text into a tweet record.

    Returns a dict with the line's content, a processing timestamp, and
    the mentions/hashtags found in it, or None when the line should be
    skipped (blank, or a bare URL).
    """
    stripped = line.strip()
    # Blank lines and bare URLs carry no tweet content — drop them.
    if not stripped or stripped.startswith('http'):
        return None
    return {
        'content': stripped,
        'timestamp': datetime.now(),
        'mentions': self._extract_mentions(stripped),
        'hashtags': self._extract_hashtags(stripped),
    }
|
| 60 |
-
|
| 61 |
def _extract_mentions(self, text):
|
| 62 |
"""Extract mentioned users from tweet."""
|
| 63 |
return [word for word in text.split() if word.startswith('@')]
|
|
|
|
| 21 |
self.vectorizer = TfidfVectorizer(stop_words='english')
|
| 22 |
self.used_tweets = set() # Track used tweets to avoid repetition
|
| 23 |
|
| 24 |
+
@staticmethod
|
| 25 |
+
def _process_line(line):
|
| 26 |
+
"""Process a single line."""
|
| 27 |
+
line = line.strip()
|
| 28 |
+
if not line or line.startswith('http'): # Skip empty lines and URLs
|
| 29 |
+
return None
|
| 30 |
+
return {
|
| 31 |
+
'content': line,
|
| 32 |
+
'timestamp': datetime.now(),
|
| 33 |
+
'mentions': [word for word in line.split() if word.startswith('@')],
|
| 34 |
+
'hashtags': [word for word in line.split() if word.startswith('#')]
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
def extract_text_from_pdf(self, pdf_path):
|
| 38 |
"""Extract text content from PDF file."""
|
| 39 |
reader = PdfReader(pdf_path)
|
|
|
|
| 48 |
raise ValueError("The uploaded PDF appears to be empty.")
|
| 49 |
|
| 50 |
lines = text.split('\n')
|
| 51 |
+
# Pass the static method explicitly
|
| 52 |
+
clean_tweets = Parallel(n_jobs=-1)(delayed(TweetDatasetProcessor._process_line)(line) for line in lines)
|
| 53 |
self.tweets = [tweet for tweet in clean_tweets if tweet]
|
| 54 |
|
| 55 |
if not self.tweets:
|
|
|
|
| 60 |
df.to_csv('processed_tweets.csv', index=False)
|
| 61 |
return df
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def _extract_mentions(self, text):
|
| 64 |
"""Extract mentioned users from tweet."""
|
| 65 |
return [word for word in text.split() if word.startswith('@')]
|