import os
import re
import json

import pandas as pd
import nltk
from dotenv import load_dotenv
from googleapiclient.discovery import build
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  # required by WordNetLemmatizer below
load_dotenv()
api_key = os.getenv('API_KEY')
youtube = build('youtube', 'v3', developerKey=api_key)

def get_all_comments(video_id):
    """Fetch up to ~100 top-level comments for a video, following pagination."""
    comments = []
    next_page_token = None
    while True:
        # Request one page of top-level comment threads
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            maxResults=100,  # API maximum per page (the default is 20)
            pageToken=next_page_token  # pagination token for the next page
        )
        response = request.execute()
        # Pull the fields we need out of each top-level comment
        for item in response['items']:
            snippet = item['snippet']['topLevelComment']['snippet']
            comments.append({
                'author': snippet['authorDisplayName'].strip(),
                'comment': snippet['textDisplay'].strip(),
                'timestamp': snippet['publishedAt'].strip(),
                'like_count': snippet['likeCount'],
            })
        # Stop when there are no more pages or the comment cap is reached
        next_page_token = response.get('nextPageToken')
        if not next_page_token or len(comments) >= 100:
            break
    return comments

def extract_youtube_id(url_or_id):
    """Return the 11-character video ID from a URL or bare ID, or None."""
    # A bare 11-character video ID is returned as-is
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url_or_id):
        return url_or_id
    # Otherwise look for the ID after 'v=' or a path separator in a URL
    match = re.search(r'(?:v=|/)([a-zA-Z0-9_-]{11})(?:&|$)?', url_or_id)
    if match:
        return match.group(1)
    return None

# Multi-word informal phrases mapped to a standard form (empty string drops them)
informal_phrases = {
    "sat set sat set": "cepat", "ya mas": ""
}

def load_slang_txt(file_path):
    """Load a slang dictionary stored as JSON in a text file."""
    slang_dict_txt = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            slang_dict_txt = json.loads(file.read())
    except (FileNotFoundError, json.JSONDecodeError):
        print(f"Error loading slang dictionary from: {file_path}")
    return slang_dict_txt

def load_slang_csv(file_path):
    """Load a two-column CSV mapping slang (column 1) to standard form (column 2)."""
    slang_df = pd.read_csv(file_path, encoding='ISO-8859-1')
    return dict(zip(slang_df.iloc[:, 0], slang_df.iloc[:, 1]))

# Combine slang dictionaries from the bundled files
slang_txt_path = 'combined_slang_words.txt'
slang_dict_txt = load_slang_txt(slang_txt_path)
slang_csv_path = 'new_kamusalay.csv'
slang_dict_csv = load_slang_csv(slang_csv_path)

# Manual additions: common Indonesian abbreviations and their standard forms
slang_dict_tambahan = {
    "gw": "saya", "mau": "ingin", "ni": "ini", "aja": "saja", "gak": "tidak", "bgt": "sangat",
    "klo": "kalau", "bgs": "bagus", "masi": "masih", "msh": "masih", "lom": "belum",
    "blm": "belum", "ap": "apa", "brg": "barang", "ad": "ada", "blom": "belum",
    "kebli": "kebeli", "tp": "tapi", "org": "orang", "tdk": "tidak", "yg": "yang",
    "kalo": "kalau", "sy": "saya", "bng": "abang", "bg": "abang", "fto": "foto",
    "spek": "spesifikasi", "cm": "cuma", "jg": "juga", "pd": "pada", "skrg": "sekarang",
    "ga": "tidak", "gk": "tidak", "batre": "baterai", "gue": "saya", "dpt": "dapat",
    "kek": "seperti", "mna": "mana", "mnding": "mending", "mend": "mending",
    "dr": "dari", "sma": "sama", "drpada": "daripada"
}

# On key collisions, the CSV entries override the TXT file, which overrides the manual additions
slang_dict = {**slang_dict_tambahan, **slang_dict_txt, **slang_dict_csv}

# Indonesian stopwords, minus words that carry sentiment for this task
stpwds_id = set(stopwords.words('indonesian'))
retain_words = ['baru', 'lama', 'sama', 'tapi', 'tidak', 'dari', 'belum', 'bagi', 'mau', 'masalah']
for word in retain_words:
    stpwds_id.discard(word)

# Initialize the lemmatizer (WordNet is English; used here as a light normalizer)
lemmatizer = WordNetLemmatizer()


def replace_slang_in_text(text, slang_dict):
    """Replace single-word slang terms using the combined dictionary."""
    words = text.split()
    return ' '.join(slang_dict.get(word, word) for word in words)

def text_preprocessing(text, slang_dict):
    """Normalize an Indonesian comment into a cleaned, deduplicated token string."""
    # Case folding (convert text to lowercase)
    text = text.lower()
    # Remove mentions, hashtags, and newlines
    text = re.sub(r"@\w+|#\w+|\n", " ", text)
    # Remove URLs (the dot in 'www.' is escaped so it matches literally)
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # Remove non-alphanumeric characters except apostrophes
    text = re.sub(r"[^\w\s']", " ", text)
    # Replace multi-word informal phrases before token-level slang replacement
    for phrase, replacement in informal_phrases.items():
        text = text.replace(phrase, replacement)
    # Replace single-word slang terms
    text = replace_slang_in_text(text, slang_dict)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stpwds_id]
    # Lemmatization (optional; WordNet is English, so Indonesian words pass through mostly unchanged)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Map tokens that the slang/phrase rules would otherwise mishandle
    stemming_exceptions = {"terasa": "terasa", "sat": "cepat", "set": "cepat"}
    tokens = [stemming_exceptions.get(word, word) for word in tokens]
    # Reassemble, dropping duplicate tokens while preserving order
    return ' '.join(dict.fromkeys(tokens))
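

# --- Minimal usage sketch of the full pipeline. Assumptions: a .env file
# defines API_KEY, the two slang files above are present, and
# 'YOUR_VIDEO_URL_OR_ID' is a placeholder to replace with a real URL or ID. ---
if __name__ == '__main__':
    video_id = extract_youtube_id('YOUR_VIDEO_URL_OR_ID')
    if video_id is None:
        raise SystemExit('Could not parse a YouTube video ID from the input.')
    comments = get_all_comments(video_id)
    # Collect comments into a DataFrame and add a preprocessed text column
    df = pd.DataFrame(comments)
    df['clean_comment'] = df['comment'].apply(lambda c: text_preprocessing(c, slang_dict))
    print(df[['author', 'clean_comment']].head())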