# Phonalitics/helper.py
import re
from dotenv import load_dotenv
import os
from googleapiclient.discovery import build
import nltk
from nltk.tokenize import word_tokenize
import json
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')   # required by WordNetLemmatizer below
nltk.download('omw-1.4')   # wordnet companion data needed by newer NLTK releases
load_dotenv()
api_key = os.getenv('API_KEY')
youtube = build('youtube', 'v3', developerKey=api_key)
def get_all_comments(video_id):
    comments = []
    next_page_token = None
    while True:
        # Make API call to get a page of top-level comments
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            pageToken=next_page_token  # Use pagination token for the next set of comments
        )
        # Execute the request
        response = request.execute()
        # Loop through the comments in the response
        for item in response['items']:
            snippet = item['snippet']['topLevelComment']['snippet']
            comment = snippet['textDisplay']
            author = snippet['authorDisplayName']
            timestamp = snippet['publishedAt']
            like_count = snippet['likeCount']
            comments.append({
                'author': author.strip(),
                'comment': comment.strip(),
                'timestamp': timestamp.strip(),
                'like_count': like_count,
            })
        # Check for another page of comments (pagination)
        next_page_token = response.get('nextPageToken')
        if not next_page_token or len(comments) >= 100:  # Stop when no pages remain or at least 100 comments are collected
            break
    return comments
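# Example (illustrative): get_all_comments("dQw4w9WgXcQ") returns a list of dicts shaped like
#   {'author': ..., 'comment': ..., 'timestamp': ..., 'like_count': ...}
# Paging stops once at least 100 comments have been collected or no pages remain.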
def extract_youtube_id(url_or_id):
    # A bare 11-character video ID is returned as-is
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url_or_id):
        return url_or_id
    # Otherwise look for an 11-character ID following 'v=' or '/'
    pattern = r'(?:v=|/)([a-zA-Z0-9_-]{11})(?:&|$)?'
    match = re.search(pattern, url_or_id)
    if match:
        return match.group(1)
    return None
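# Example (illustrative): both forms resolve to the same video ID
#   extract_youtube_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  -> "dQw4w9WgXcQ"
#   extract_youtube_id("dQw4w9WgXcQ")                                  -> "dQw4w9WgXcQ"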
informal_phrases = {
"sat set sat set": "cepat", "ya mas": ""
}
def load_slang_txt(file_path):
    slang_dict_txt = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        slang_dict_txt = json.loads(file_content)
    except json.JSONDecodeError:
        print(f"Error decoding JSON in the file: {file_path}")
    return slang_dict_txt
def load_slang_csv(file_path):
    slang_df = pd.read_csv(file_path, encoding='ISO-8859-1')
    return dict(zip(slang_df.iloc[:, 0], slang_df.iloc[:, 1]))
# Combine slang dictionaries
slang_txt_path = 'combined_slang_words.txt'
slang_dict_txt = load_slang_txt(slang_txt_path)
slang_csv_path = 'new_kamusalay.csv'
slang_dict_csv = load_slang_csv(slang_csv_path)
slang_dict_tambahan = {
"gw": "saya", "mau": "ingin", "ni": "ini", "aja": "saja", "gak": "tidak", "bgt": "sangat",
"klo": "kalau", "bgs": "bagus", "masi": "masih", "msh": "masih", "lom": "belum",
"blm": "belum", "ap": "apa", "brg": "barang", "ad": "ada", "blom": "belum",
"kebli": "kebeli", "tp": "tapi", "org": "orang", "tdk": "tidak", "yg": "yang",
"kalo": "kalau", "sy": "saya", "bng": "abang", "bg": "abang", "fto": "foto",
"spek": "spesifikasi", "cm": "cuma", "jg": "juga", "pd": "pada", "skrg": "sekarang",
"ga": "tidak", "gk": "tidak", "batre": "baterai", "gue": "saya", "dpt": "dapat",
"kek": "seperti", "mna": "mana", "mnding": "mending", "mend": "mending",
"dr": "dari", "sma": "sama", "drpada": "daripada"
}
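# Merge order matters: entries loaded from the txt and csv files override the manual additions above on duplicate keys.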
slang_dict = {**slang_dict_tambahan, **slang_dict_txt, **slang_dict_csv}
# Stopwords (Adjusted)
stpwds_id = list(set(stopwords.words('indonesian')))
retain_words = ['baru', 'lama', 'sama', 'tapi', 'tidak', 'dari', 'belum', 'bagi', 'mau', 'masalah']
for word in retain_words:
    if word in stpwds_id:
        stpwds_id.remove(word)
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
# Function to replace slang terms
def replace_slang_in_text(text, slang_dict):
    words = text.split()
    replaced_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(replaced_words)
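# Example (illustrative, assuming the manual slang entries above are not overridden by the loaded files):
#   replace_slang_in_text("gw mau beli skrg", slang_dict) -> "saya ingin beli sekarang"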
def text_preprocessing(text, slang_dict):
    # Case folding (convert text to lowercase)
    text = text.lower()
    # Remove mentions, hashtags, and newlines
    text = re.sub(r"@[\w]+|#[\w]+|\n", " ", text)
    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # Remove punctuation and symbols (keep word characters, whitespace, and apostrophes)
    text = re.sub(r"[^\w\s']", " ", text)
    # Replace informal multi-word phrases
    for phrase, replacement in informal_phrases.items():
        text = text.replace(phrase, replacement)
    # Replace slang terms
    text = replace_slang_in_text(text, slang_dict)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stpwds_id]
    # Lemmatization (optional, but can improve performance)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Map specific tokens to canonical forms (stemming-like exceptions)
    stemming_exceptions = {"terasa": "terasa", "sat": "cepat", "set": "cepat"}
    tokens = [stemming_exceptions.get(word, word) for word in tokens]
    # Reassemble the text, dropping duplicate tokens while preserving order
    text = ' '.join(dict.fromkeys(tokens))
    return text
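# Minimal usage sketch (not part of the original module): the video URL below is a placeholder,
# and a valid API_KEY must be present in the .env file for the YouTube client built above.
if __name__ == "__main__":
    video_id = extract_youtube_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
    if video_id:
        raw_comments = get_all_comments(video_id)
        cleaned = [text_preprocessing(c['comment'], slang_dict) for c in raw_comments]
        print(f"Fetched {len(raw_comments)} comments")
        if cleaned:
            print(f"First cleaned comment: {cleaned[0]}")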