Spaces:
Build error
Build error
Upload 52 files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +7 -5
- app.py +10 -0
- combined_slang_words.txt +1 -0
- final_dataset_text_processed.csv +0 -0
- helper.py +164 -0
- home.py +41 -0
- images/.DS_Store +0 -0
- images/ASUS Zenfone 11 Ultra.jpg +0 -0
- images/Galaxy A06.jpg +0 -0
- images/Galaxy S24 Ultra.png +0 -0
- images/Galaxy S24+.jpeg +0 -0
- images/Galaxy S24.jpg +0 -0
- images/Galaxy S25 Slim.png +0 -0
- images/Galaxy S25 Ultra.png +0 -0
- images/Galaxy S25.png +0 -0
- images/Google Pixel 9.jpeg +0 -0
- images/Huawei Mate XT Ultimate.jpg +0 -0
- images/Huawei Nova Flip.png +0 -0
- images/Huawei Pura 70 Pro.png +0 -0
- images/Huawei Pura 70 Ultra.jpg +0 -0
- images/Huawei Pura 70.jpeg +0 -0
- images/Huawei Pura 80 Ultra.jpg +0 -0
- images/Infinix Zero 30.jpeg +0 -0
- images/OPPO A3x.jpg +0 -0
- images/OPPO Find X7.jpg +0 -0
- images/POCO M7 Pro 5G.png +0 -0
- images/Pixel 10 Pro.jpeg +0 -0
- images/Pixel 10.jpg +0 -0
- images/Pixel 9a.jpg +0 -0
- images/Realme 12 5G.jpg +0 -0
- images/Realme 12 Pro 5G.jpg +0 -0
- images/Realme 13 Series.jpg +0 -0
- images/Realme C61.jpeg +0 -0
- images/Redmi Note 13 5G.jpg +0 -0
- images/Redmi Note 13 Pro 5G.jpg +0 -0
- images/Redmi Note 14 Series.png +0 -0
- images/Sharp Aquos Sense 8.jpg +0 -0
- images/Tecno Spark 20c.png +0 -0
- images/Vivo V40.jpeg +0 -0
- images/Vivo X100 Pro.jpg +0 -0
- images/Xiaomi 14T Pro.jpg +0 -0
- images/Xiaomi 14T.png +0 -0
- images/Xiaomi 15 Pro.jpg +0 -0
- images/Xiaomi 15.png +0 -0
- images/iPhone 16.jpeg +0 -0
- images/iPhone 17 Air.jpg +0 -0
- images/iPhone 17.jpeg +0 -0
- images/iQOO Z8.jpeg +0 -0
- model_lstm.h5 +3 -0
- new_kamusalay.csv +0 -0
README.md
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Phonalitics
|
| 3 |
+
emoji: 🦀
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: streamlit
|
| 7 |
+
sdk_version: 1.40.1
|
| 8 |
+
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
# Use the full browser width for every page of the app.
st.set_page_config(layout="wide")

# Pages offered in the navigation, in display order.
# NOTE(review): "youtube.py" is referenced here — confirm it is shipped
# alongside app.py and home.py.
word_cloud_page = st.Page("home.py", title="Word Clouds", icon="🔥")
youtube_page = st.Page("youtube.py", title="Youtube Comments Analysis", icon="💬")

pg = st.navigation([word_cloud_page, youtube_page])

# Run whichever page is currently selected.
pg.run()
|
| 10 |
+
|
combined_slang_words.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"@": "di", "abis": "habis", "ad": "ada", "adlh": "adalah", "afaik": "as far as i know", "ahaha": "haha", "aj": "saja", "ajep-ajep": "dunia gemerlap", "ak": "saya", "akika": "aku", "akkoh": "aku", "akuwh": "aku", "alay": "norak", "alow": "halo", "ambilin": "ambilkan", "ancur": "hancur", "anjrit": "anjing", "anter": "antar", "ap2": "apa-apa", "apasih": "apa sih", "apes": "sial", "aps": "apa", "aq": "saya", "aquwh": "aku", "asbun": "asal bunyi", "aseekk": "asyik", "asekk": "asyik", "asem": "asam", "aspal": "asli tetapi palsu", "astul": "asal tulis", "ato": "atau", "au ah": "tidak mau tahu", "awak": "saya", "ay": "sayang", "ayank": "sayang", "b4": "sebelum", "bakalan": "akan", "bandes": "bantuan desa", "bangedh": "banget", "banpol": "bantuan polisi", "banpur": "bantuan tempur", "basbang": "basi", "bcanda": "bercanda", "bdg": "bandung", "begajulan": "nakal", "beliin": "belikan", "bencong": "banci", "bentar": "sebentar", "ber3": "bertiga", "beresin": "membereskan", "bete": "bosan", "beud": "banget", "bg": "abang", "bgmn": "bagaimana", "bgt": "banget", "bijimane": "bagaimana", "bintal": "bimbingan mental", "bkl": "akan", "bknnya": "bukannya", "blegug": "bodoh", "blh": "boleh", "bln": "bulan", "blum": "belum", "bnci": "benci", "bnran": "yang benar", "bodor": "lucu", "bokap": "ayah", "boker": "buang air besar", "bokis": "bohong", "boljug": "boleh juga", "bonek": "bocah nekat", "boyeh": "boleh", "br": "baru", "brg": "bareng", "bro": "saudara laki-laki", "bru": "baru", "bs": "bisa", "bsen": "bosan", "bt": "buat", "btw": "ngomong-ngomong", "buaya": "tidak setia", "bubbu": "tidur", "bubu": "tidur", "bumil": "ibu hamil", "bw": "bawa", "bwt": "buat", "byk": "banyak", "byrin": "bayarkan", "cabal": "sabar", "cadas": "keren", "calo": "makelar", "can": "belum", "capcus": "pergi", "caper": "cari perhatian", "ce": "cewek", "cekal": "cegah tangkal", "cemen": "penakut", "cengengesan": "tertawa", "cepet": "cepat", "cew": "cewek", "chuyunk": "sayang", "cimeng": "ganja", "cipika cipiki": 
"cium pipi kanan cium pipi kiri", "ciyh": "sih", "ckepp": "cakep", "ckp": "cakep", "cmiiw": "correct me if i'm wrong", "cmpur": "campur", "cong": "banci", "conlok": "cinta lokasi", "cowwyy": "maaf", "cp": "siapa", "cpe": "capek", "cppe": "capek", "cucok": "cocok", "cuex": "cuek", "cumi": "Cuma miscall", "cups": "culun", "curanmor": "pencurian kendaraan bermotor", "curcol": "curahan hati colongan", "cwek": "cewek", "cyin": "cinta", "d": "di", "dah": "deh", "dapet": "dapat", "de": "adik", "dek": "adik", "demen": "suka", "deyh": "deh", "dgn": "dengan", "diancurin": "dihancurkan", "dimaafin": "dimaafkan", "dimintak": "diminta", "disono": "di sana", "dket": "dekat", "dkk": "dan kawan-kawan", "dll": "dan lain-lain", "dlu": "dulu", "dngn": "dengan", "dodol": "bodoh", "doku": "uang", "dongs": "dong", "dpt": "dapat", "dri": "dari", "drmn": "darimana", "drtd": "dari tadi", "dst": "dan seterusnya", "dtg": "datang", "duh": "aduh", "duren": "durian", "ed": "edisi", "egp": "emang gue pikirin", "eke": "aku", "elu": "kamu", "emangnya": "memangnya", "emng": "memang", "endak": "tidak", "enggak": "tidak", "envy": "iri", "ex": "mantan", "fax": "facsimile", "fifo": "first in first out", "folbek": "follow back", "fyi": "sebagai informasi", "gaada": "tidak ada uang", "gag": "tidak", "gaje": "tidak jelas", "gak papa": "tidak apa-apa", "gan": "juragan", "gaptek": "gagap teknologi", "gatek": "gagap teknologi", "gawe": "kerja", "gbs": "tidak bisa", "gebetan": "orang yang disuka", "geje": "tidak jelas", "gepeng": "gelandangan dan pengemis", "ghiy": "lagi", "gile": "gila", "gimana": "bagaimana", "gino": "gigi nongol", "githu": "gitu", "gj": "tidak jelas", "gmana": "bagaimana", "gn": "begini", "goblok": "bodoh", "golput": "golongan putih", "gowes": "mengayuh sepeda", "gpny": "tidak punya", "gr": "gede rasa", "gretongan": "gratisan", "gtau": "tidak tahu", "gua": "saya", "guoblok": "goblok", "gw": "saya", "ha": "tertawa", "haha": "tertawa", "hallow": "halo", "hankam": "pertahanan dan keamanan", 
"hehe": "he", "helo": "halo", "hey": "hai", "hlm": "halaman", "hny": "hanya", "hoax": "isu bohong", "hr": "hari", "hrus": "harus", "hubdar": "perhubungan darat", "huff": "mengeluh", "hum": "rumah", "humz": "rumah", "ilang": "hilang", "ilfil": "tidak suka", "imho": "in my humble opinion", "imoetz": "imut", "item": "hitam", "itungan": "hitungan", "iye": "iya", "ja": "saja", "jadiin": "jadi", "jaim": "jaga image", "jayus": "tidak lucu", "jdi": "jadi", "jem": "jam", "jga": "juga", "jgnkan": "jangankan", "jir": "anjing", "jln": "jalan", "jomblo": "tidak punya pacar", "jubir": "juru bicara", "jutek": "galak", "k": "ke", "kab": "kabupaten", "kabor": "kabur", "kacrut": "kacau", "kadiv": "kepala divisi", "kagak": "tidak", "kalo": "kalau", "kampret": "sialan", "kamtibmas": "keamanan dan ketertiban masyarakat", "kamuwh": "kamu", "kanwil": "kantor wilayah", "karna": "karena", "kasubbag": "kepala subbagian", "katrok": "kampungan", "kayanya": "kayaknya", "kbr": "kabar", "kdu": "harus", "kec": "kecamatan", "kejurnas": "kejuaraan nasional", "kekeuh": "keras kepala", "kel": "kelurahan", "kemaren": "kemarin", "kepengen": "mau", "kepingin": "mau", "kepsek": "kepala sekolah", "kesbang": "kesatuan bangsa", "kesra": "kesejahteraan rakyat", "ketrima": "diterima", "kgiatan": "kegiatan", "kibul": "bohong", "kimpoi": "kawin", "kl": "kalau", "klianz": "kalian", "kloter": "kelompok terbang", "klw": "kalau", "km": "kamu", "kmps": "kampus", "kmrn": "kemarin", "knal": "kenal", "knp": "kenapa", "kodya": "kota madya", "komdis": "komisi disiplin", "komsov": "komunis sovyet", "kongkow": "kumpul bareng teman-teman", "kopdar": "kopi darat", "korup": "korupsi", "kpn": "kapan", "krenz": "keren", "krm": "kirim", "kt": "kita", "ktmu": "ketemu", "ktr": "kantor", "kuper": "kurang pergaulan", "kw": "imitasi", "kyk": "seperti", "la": "lah", "lam": "salam", "lamp": "lampiran", "lanud": "landasan udara", "latgab": "latihan gabungan", "lebay": "berlebihan", "leh": "boleh", "lelet": "lambat", "lemot": "lambat", 
"lgi": "lagi", "lgsg": "langsung", "liat": "lihat", "litbang": "penelitian dan pengembangan", "lmyn": "lumayan", "lo": "kamu", "loe": "kamu", "lola": "lambat berfikir", "louph": "cinta", "low": "kalau", "lp": "lupa", "luber": "langsung, umum, bebas, dan rahasia", "luchuw": "lucu", "lum": "belum", "luthu": "lucu", "lwn": "lawan", "maacih": "terima kasih", "mabal": "bolos", "macem": "macam", "macih": "masih", "maem": "makan", "magabut": "makan gaji buta", "maho": "homo", "mak jang": "kaget", "maksain": "memaksa", "malem": "malam", "mam": "makan", "maneh": "kamu", "maniez": "manis", "mao": "mau", "masukin": "masukkan", "melu": "ikut", "mepet": "dekat sekali", "mgu": "minggu", "migas": "minyak dan gas bumi", "mikol": "minuman beralkohol", "miras": "minuman keras", "mlah": "malah", "mngkn": "mungkin", "mo": "mau", "mokad": "mati", "moso": "masa", "mpe": "sampai", "msk": "masuk", "mslh": "masalah", "mt": "makan teman", "mubes": "musyawarah besar", "mulu": "melulu", "mumpung": "selagi", "munas": "musyawarah nasional", "muntaber": "muntah dan berak", "musti": "mesti", "muupz": "maaf", "mw": "now watching", "n": "dan", "nanam": "menanam", "nanya": "bertanya", "napa": "kenapa", "napi": "narapidana", "napza": "narkotika, alkohol, psikotropika, dan zat adiktif ", "narkoba": "narkotika, psikotropika, dan obat terlarang", "nasgor": "nasi goreng", "nda": "tidak", "ndiri": "sendiri", "ne": "ini", "nekolin": "neokolonialisme", "nembak": "menyatakan cinta", "ngabuburit": "menunggu berbuka puasa", "ngaku": "mengaku", "ngambil": "mengambil", "nganggur": "tidak punya pekerjaan", "ngapah": "kenapa", "ngaret": "terlambat", "ngasih": "memberikan", "ngebandel": "berbuat bandel", "ngegosip": "bergosip", "ngeklaim": "mengklaim", "ngeksis": "menjadi eksis", "ngeles": "berkilah", "ngelidur": "menggigau", "ngerampok": "merampok", "ngga": "tidak", "ngibul": "berbohong", "ngiler": "mau", "ngiri": "iri", "ngisiin": "mengisikan", "ngmng": "bicara", "ngomong": "bicara", "ngubek2": "mencari-cari", 
"ngurus": "mengurus", "nie": "ini", "nih": "ini", "niyh": "nih", "nmr": "nomor", "nntn": "nonton", "nobar": "nonton bareng", "np": "now playing", "ntar": "nanti", "ntn": "nonton", "numpuk": "bertumpuk", "nutupin": "menutupi", "nyari": "mencari", "nyekar": "menyekar", "nyicil": "mencicil", "nyoblos": "mencoblos", "nyokap": "ibu", "ogah": "tidak mau", "ol": "online", "ongkir": "ongkos kirim", "oot": "out of topic", "org2": "orang-orang", "ortu": "orang tua", "otda": "otonomi daerah", "otw": "on the way, sedang di jalan", "pacal": "pacar", "pake": "pakai", "pala": "kepala", "pansus": "panitia khusus", "parpol": "partai politik", "pasutri": "pasangan suami istri", "pd": "pada", "pede": "percaya diri", "pelatnas": "pemusatan latihan nasional", "pemda": "pemerintah daerah", "pemkot": "pemerintah kota", "pemred": "pemimpin redaksi", "penjas": "pendidikan jasmani", "perda": "peraturan daerah", "perhatiin": "perhatikan", "pesenan": "pesanan", "pgang": "pegang", "pi": "tapi", "pilkada": "pemilihan kepala daerah", "pisan": "sangat", "pk": "penjahat kelamin", "plg": "paling", "pmrnth": "pemerintah", "polantas": "polisi lalu lintas", "ponpes": "pondok pesantren", "pp": "pulang pergi", "prg": "pergi", "prnh": "pernah", "psen": "pesan", "pst": "pasti", "pswt": "pesawat", "pw": "posisi nyaman", "qmu": "kamu", "rakor": "rapat koordinasi", "ranmor": "kendaraan bermotor", "re": "reply", "ref": "referensi", "rehab": "rehabilitasi", "rempong": "sulit", "repp": "balas", "restik": "reserse narkotika", "rhs": "rahasia", "rmh": "rumah", "ru": "baru", "ruko": "rumah toko", "rusunawa": "rumah susun sewa", "ruz": "terus", "saia": "saya", "salting": "salah tingkah", "sampe": "sampai", "samsek": "sama sekali", "sapose": "siapa", "satpam": "satuan pengamanan", "sbb": "sebagai berikut", "sbh": "sebuah", "sbnrny": "sebenarnya", "scr": "secara", "sdgkn": "sedangkan", "sdkt": "sedikit", "se7": "setuju", "sebelas dua belas": "mirip", "sembako": "sembilan bahan pokok", "sempet": "sempat", 
"sendratari": "seni drama tari", "sgt": "sangat", "shg": "sehingga", "siech": "sih", "sikon": "situasi dan kondisi", "sinetron": "sinema elektronik", "siramin": "siramkan", "sj": "saja", "skalian": "sekalian", "sklh": "sekolah", "skt": "sakit", "slesai": "selesai", "sll": "selalu", "slma": "selama", "slsai": "selesai", "smpt": "sempat", "smw": "semua", "sndiri": "sendiri", "soljum": "sholat jumat", "songong": "sombong", "sory": "maaf", "sosek": "sosial-ekonomi", "sotoy": "sok tahu", "spa": "siapa", "sppa": "siapa", "spt": "seperti", "srtfkt": "sertifikat", "stiap": "setiap", "stlh": "setelah", "suk": "masuk", "sumpek": "sempit", "syg": "sayang", "t4": "tempat", "tajir": "kaya", "tau": "tahu", "taw": "tahu", "td": "tadi", "tdk": "tidak", "teh": "kakak perempuan", "telat": "terlambat", "telmi": "telat berpikir", "temen": "teman", "tengil": "menyebalkan", "tepar": "terkapar", "tggu": "tunggu", "tgu": "tunggu", "thankz": "terima kasih", "thn": "tahun", "tilang": "bukti pelanggaran", "tipiwan": "TvOne", "tks": "terima kasih", "tlp": "telepon", "tls": "tulis", "tmbah": "tambah", "tmen2": "teman-teman", "tmpah": "tumpah", "tmpt": "tempat", "tngu": "tunggu", "tnyta": "ternyata", "tokai": "tai", "toserba": "toko serba ada", "tpi": "tapi", "trdhulu": "terdahulu", "trima": "terima kasih", "trm": "terima", "trs": "terus", "trutama": "terutama", "ts": "penulis", "tst": "tahu sama tahu", "ttg": "tentang", "tuch": "tuh", "tuir": "tua", "tw": "tahu", "u": "kamu", "ud": "sudah", "udah": "sudah", "ujg": "ujung", "ul": "ulangan", "unyu": "lucu", "uplot": "unggah", "urang": "saya", "usah": "perlu", "utk": "untuk", "valas": "valuta asing", "w/": "dengan", "wadir": "wakil direktur", "wamil": "wajib militer", "warkop": "warung kopi", "warteg": "warung tegal", "wat": "buat", "wkt": "waktu", "wtf": "what the fuck", "xixixi": "tertawa", "ya": "iya", "yap": "iya", "yaudah": "ya sudah", "yawdah": "ya sudah", "yg": "yang", "yl": "yang lain", "yo": "iya", "yowes": "ya sudah", "yup": "iya", 
"7an": "tujuan", "ababil": "abg labil", "acc": "accord", "adlah": "adalah", "adoh": "aduh", "aha": "tertawa", "aing": "saya", "aja": "saja", "ajj": "saja", "aka": "dikenal juga sebagai", "akko": "aku", "akku": "aku", "akyu": "aku", "aljasa": "asal jadi saja", "ama": "sama", "ambl": "ambil", "anjir": "anjing", "ank": "anak", "ap": "apa", "apaan": "apa", "ape": "apa", "aplot": "unggah", "apva": "apa", "aqu": "aku", "asap": "sesegera mungkin", "aseek": "asyik", "asek": "asyik", "aseknya": "asyiknya", "asoy": "asyik", "astrojim": "astagfirullahaladzim", "ath": "kalau begitu", "atuh": "kalau begitu", "ava": "avatar", "aws": "awas", "ayang": "sayang", "ayok": "ayo", "bacot": "banyak bicara", "bales": "balas", "bangdes": "pembangunan desa", "bangkotan": "tua", "banpres": "bantuan presiden", "bansarkas": "bantuan sarana kesehatan", "bazis": "badan amal, zakat, infak, dan sedekah", "bcoz": "karena", "beb": "sayang", "bejibun": "banyak", "belom": "belum", "bener": "benar", "ber2": "berdua", "berdikari": "berdiri di atas kaki sendiri", "bet": "banget", "beti": "beda tipis", "beut": "banget", "bgd": "banget", "bgs": "bagus", "bhubu": "tidur", "bimbuluh": "bimbingan dan penyuluhan", "bisi": "kalau-kalau", "bkn": "bukan", "bl": "beli", "blg": "bilang", "blm": "belum", "bls": "balas", "bnchi": "benci", "bngung": "bingung", "bnyk": "banyak", "bohay": "badan aduhai", "bokep": "porno", "bokin": "pacar", "bole": "boleh", "bolot": "bodoh", "bonyok": "ayah ibu", "bpk": "bapak", "brb": "segera kembali", "brngkt": "berangkat", "brp": "berapa", "brur": "saudara laki-laki", "bsa": "bisa", "bsk": "besok", "bu_bu": "tidur", "bubarin": "bubarkan", "buber": "buka bersama", "bujubune": "luar biasa", "buser": "buru sergap", "bwhn": "bawahan", "byar": "bayar", "byr": "bayar", "c8": "chat", "cabut": "pergi", "caem": "cakep", "cama-cama": "sama-sama", "cangcut": "celana dalam", "cape": "capek", "caur": "jelek", "cekak": "tidak ada uang", "cekidot": "coba lihat", "cemplungin": "cemplungkan", 
"ceper": "pendek", "ceu": "kakak perempuan", "cewe": "cewek", "cibuk": "sibuk", "cin": "cinta", "ciye": "cie", "ckck": "ck", "clbk": "cinta lama bersemi kembali", "cmpr": "campur", "cnenk": "senang", "congor": "mulut", "cow": "cowok", "coz": "karena", "cpa": "siapa", "gokil": "gila", "gombal": "suka merayu", "gpl": "tidak pakai lama", "gpp": "tidak apa-apa", "gretong": "gratis", "gt": "begitu", "gtw": "tidak tahu", "gue": "saya", "guys": "teman-teman", "gws": "cepat sembuh", "haghaghag": "tertawa", "hakhak": "tertawa", "handak": "bahan peledak", "hansip": "pertahanan sipil", "hellow": "halo", "helow": "halo", "hi": "hai", "hlng": "hilang", "hnya": "hanya", "houm": "rumah", "hrs": "harus", "hubad": "hubungan angkatan darat", "hubla": "perhubungan laut", "huft": "mengeluh", "humas": "hubungan masyarakat", "idk": "saya tidak tahu", "ilfeel": "tidak suka", "imba": "jago sekali", "imoet": "imut", "info": "informasi", "itung": "hitung", "isengin": "bercanda", "iyala": "iya lah", "iyo": "iya", "jablay": "jarang dibelai", "jadul": "jaman dulu", "jancuk": "anjing", "jd": "jadi", "jdikan": "jadikan", "jg": "juga", "jgn": "jangan", "jijay": "jijik", "jkt": "jakarta", "jnj": "janji", "jth": "jatuh", "jurdil": "jujur adil", "jwb": "jawab", "ka": "kakak", "kabag": "kepala bagian", "kacian": "kasihan", "kadit": "kepala direktorat", "kaga": "tidak", "kaka": "kakak", "kamtib": "keamanan dan ketertiban", "kamuh": "kamu", "kamyu": "kamu", "kapt": "kapten", "kasat": "kepala satuan", "kasubbid": "kepala subbidang", "kau": "kamu", "kbar": "kabar", "kcian": "kasihan", "keburu": "terlanjur", "kedubes": "kedutaan besar", "kek": "seperti", "keknya": "kayaknya", "keliatan": "kelihatan", "keneh": "masih", "kepikiran": "terpikirkan", "kepo": "mau tahu urusan orang", "kere": "tidak punya uang", "kesian": "kasihan", "ketauan": "ketahuan", "keukeuh": "keras kepala", "khan": "kan", "kibus": "kaki busuk", "kk": "kakak", "klian": "kalian", "klo": "kalau", "kluarga": "keluarga", "klwrga": "keluarga", 
"kmari": "kemari", "kmpus": "kampus", "kn": "kan", "knl": "kenal", "knpa": "kenapa", "kog": "kok", "kompi": "komputer", "komtiong": "komunis Tiongkok", "konjen": "konsulat jenderal", "koq": "kok", "kpd": "kepada", "kptsan": "keputusan", "krik": "garing", "krn": "karena", "ktauan": "ketahuan", "ktny": "katanya", "kudu": "harus", "kuq": "kok", "ky": "seperti", "kykny": "kayanya", "laka": "kecelakaan", "lambreta": "lambat", "lansia": "lanjut usia", "lapas": "lembaga pemasyarakatan", "lbur": "libur", "lekong": "laki-laki", "lg": "lagi", "lgkp": "lengkap", "lht": "lihat", "linmas": "perlindungan masyarakat", "lmyan": "lumayan", "lngkp": "lengkap", "loch": "loh", "lol": "tertawa", "lom": "belum", "loupz": "cinta", "lowh": "kamu", "lu": "kamu", "luchu": "lucu", "luff": "cinta", "luph": "cinta", "lw": "kamu", "lwt": "lewat", "maaciw": "terima kasih", "mabes": "markas besar", "macem-macem": "macam-macam", "madesu": "masa depan suram", "maen": "main", "mahatma": "maju sehat bersama", "mak": "ibu", "makasih": "terima kasih", "malah": "bahkan", "malu2in": "memalukan", "mamz": "makan", "manies": "manis", "mantep": "mantap", "markus": "makelar kasus", "mba": "mbak", "mending": "lebih baik", "mgkn": "mungkin", "mhn": "mohon", "miker": "minuman keras", "milis": "mailing list", "mksd": "maksud", "mls": "malas", "mnt": "minta", "moge": "motor gede", "mokat": "mati", "mosok": "masa", "msh": "masih", "mskpn": "meskipun", "msng2": "masing-masing", "muahal": "mahal", "muker": "musyawarah kerja", "mumet": "pusing", "muna": "munafik", "munaslub": "musyawarah nasional luar biasa", "musda": "musyawarah daerah", "muup": "maaf", "muuv": "maaf", "nal": "kenal", "nangis": "menangis", "naon": "apa", "napol": "narapidana politik", "naq": "anak", "narsis": "bangga pada diri sendiri", "nax": "anak", "ndak": "tidak", "ndut": "gendut", "nekolim": "neokolonialisme", "nelfon": "menelepon", "ngabis2in": "menghabiskan", "ngakak": "tertawa", "ngambek": "marah", "ngampus": "pergi ke kampus", "ngantri": 
"mengantri", "ngapain": "sedang apa", "ngaruh": "berpengaruh", "ngawur": "berbicara sembarangan", "ngeceng": "kumpul bareng-bareng", "ngeh": "sadar", "ngekos": "tinggal di kos", "ngelamar": "melamar", "ngeliat": "melihat", "ngemeng": "bicara terus-terusan", "ngerti": "mengerti", "nggak": "tidak", "ngikut": "ikut", "nginep": "menginap", "ngisi": "mengisi", "ngmg": "bicara", "ngocol": "lucu", "ngomongin": "membicarakan", "ngumpul": "berkumpul", "ni": "ini", "nyasar": "tersesat", "nyariin": "mencari", "nyiapin": "mempersiapkan", "nyiram": "menyiram", "nyok": "ayo", "o/": "oleh", "ok": "ok", "priksa": "periksa", "pro": "profesional", "psn": "pesan", "psti": "pasti", "puanas": "panas", "qmo": "kamu", "qt": "kita", "rame": "ramai", "raskin": "rakyat miskin", "red": "redaksi", "reg": "register", "rejeki": "rezeki", "renstra": "rencana strategis", "reskrim": "reserse kriminal", "sni": "sini", "somse": "sombong sekali", "sorry": "maaf", "sosbud": "sosial-budaya", "sospol": "sosial-politik", "sowry": "maaf", "spd": "sepeda", "sprti": "seperti", "spy": "supaya", "stelah": "setelah", "subbag": "subbagian", "sumbangin": "sumbangkan", "sy": "saya", "syp": "siapa", "tabanas": "tabungan pembangunan nasional", "tar": "nanti", "taun": "tahun", "tawh": "tahu", "tdi": "tadi", "te2p": "tetap", "tekor": "rugi", "telkom": "telekomunikasi", "telp": "telepon", "temen2": "teman-teman", "tengok": "menjenguk", "terbitin": "terbitkan", "tgl": "tanggal", "thanks": "terima kasih", "thd": "terhadap", "thx": "terima kasih", "tipi": "TV", "tkg": "tukang", "tll": "terlalu", "tlpn": "telepon", "tman": "teman", "tmbh": "tambah", "tmn2": "teman-teman", "tmph": "tumpah", "tnda": "tanda", "tnh": "tanah", "togel": "toto gelap", "tp": "tapi", "tq": "terima kasih", "trgntg": "tergantung", "trims": "terima kasih", "cb": "coba", "y": "ya", "munfik": "munafik", "reklamuk": "reklamasi", "sma": "sama", "tren": "trend", "ngehe": "kesal", "mz": "mas", "analisise": "analisis", "sadaar": "sadar", "sept": 
"september", "nmenarik": "menarik", "zonk": "bodoh", "rights": "benar", "simiskin": "miskin", "ngumpet": "sembunyi", "hardcore": "keras", "akhirx": "akhirnya", "solve": "solusi", "watuk": "batuk", "ngebully": "intimidasi", "masy": "masyarakat", "still": "masih", "tauk": "tahu", "mbual": "bual", "tioghoa": "tionghoa", "ngentotin": "senggama", "kentot": "senggama", "faktakta": "fakta", "sohib": "teman", "rubahnn": "rubah", "trlalu": "terlalu", "nyela": "cela", "heters": "pembenci", "nyembah": "sembah", "most": "paling", "ikon": "lambang", "light": "terang", "pndukung": "pendukung", "setting": "atur", "seting": "akting", "next": "lanjut", "waspadalah": "waspada", "gantengsaya": "ganteng", "parte": "partai", "nyerang": "serang", "nipu": "tipu", "ktipu": "tipu", "jentelmen": "berani", "buangbuang": "buang", "tsangka": "tersangka", "kurng": "kurang", "ista": "nista", "less": "kurang", "koar": "teriak", "paranoid": "takut", "problem": "masalah", "tahi": "kotoran", "tirani": "tiran", "tilep": "tilap", "happy": "bahagia", "tak": "tidak", "penertiban": "tertib", "uasai": "kuasa", "mnolak": "tolak", "trending": "trend", "taik": "tahi", "wkwkkw": "tertawa", "ahokncc": "ahok", "istaa": "nista", "benarjujur": "jujur", "mgkin": "mungkin"}
|
final_dataset_text_processed.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
helper.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
from googleapiclient.discovery import build
|
| 5 |
+
import nltk
|
| 6 |
+
from nltk.tokenize import word_tokenize
|
| 7 |
+
import json
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from nltk.stem import WordNetLemmatizer
|
| 10 |
+
from nltk.corpus import stopwords
|
| 11 |
+
|
| 12 |
+
# NLTK data needed by this module.
# - 'stopwords' and 'punkt' back stopwords.words() and word_tokenize().
# - 'wordnet' is required by WordNetLemmatizer; without it the first
#   lemmatize() call raises LookupError.
# - 'punkt_tab' and 'omw-1.4' are looked up by some NLTK releases in addition
#   to 'punkt' / 'wordnet', so they are fetched as well.
for _resource in (
    'averaged_perceptron_tagger',
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
):
    nltk.download(_resource)

# Load settings such as API_KEY from a local .env file into the environment.
load_dotenv()

# NOTE(review): api_key is None when API_KEY is not set; presumably API
# requests are then rejected at call time — confirm and fail early if desired.
api_key = os.getenv('API_KEY')
youtube = build('youtube', 'v3', developerKey=api_key)
|
| 21 |
+
|
| 22 |
+
def get_all_comments(video_id, max_comments=100):
    """Fetch top-level comments of a YouTube video through the Data API.

    Pages through ``commentThreads().list`` using the module-level ``youtube``
    client until there are no more pages or ``max_comments`` have been
    collected.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments: Maximum number of comments to return. The default of
            100 matches the cap this function has always applied. Pass
            ``None`` to keep paging until the API reports no further page.

    Returns:
        A list of dicts with the keys ``author``, ``comment``, ``timestamp``
        and ``like_count``, in the order the API returned them.

    Errors raised while executing the request are not caught here and
    propagate to the caller.
    """
    comments = []
    if max_comments is not None and max_comments <= 0:
        return comments

    next_page_token = None
    while True:
        # Ask only for what is still needed (the API accepts at most 100 per
        # page) so a capped fetch does not spend extra requests.
        if max_comments is None:
            page_size = 100
        else:
            page_size = min(100, max_comments - len(comments))

        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            maxResults=page_size,
            pageToken=next_page_token,  # None on the first request
        ).execute()

        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']
            comments.append({
                'author': snippet['authorDisplayName'].strip(),
                'comment': snippet['textDisplay'].strip(),
                'timestamp': snippet['publishedAt'].strip(),
                'like_count': snippet['likeCount'],
            })

        next_page_token = response.get('nextPageToken')

        # Stop when the API has no further page or the cap has been reached.
        if not next_page_token:
            break
        if max_comments is not None and len(comments) >= max_comments:
            break

    return comments
|
| 58 |
+
|
| 59 |
+
def extract_youtube_id(url_or_id):
    """Return the 11-character YouTube video ID found in ``url_or_id``.

    Args:
        url_or_id: Either a bare video ID or a URL that carries the ID after
            ``v=`` or as a path segment (for example ``youtu.be/<id>`` or
            ``/embed/<id>``). Surrounding whitespace is ignored.

    Returns:
        The video ID as a string, or ``None`` when the input is not a string
        or no ID can be found.
    """
    if not isinstance(url_or_id, str):
        return None

    candidate = url_or_id.strip()

    # A bare ID is returned unchanged.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
        return candidate

    # The negative lookahead requires the ID to end after exactly 11
    # characters; otherwise the first 11 characters of a longer segment
    # (a host name such as "youtube-nocookie", a channel ID, ...) would be
    # reported as a video ID.
    match = re.search(
        r'(?:v=|/)([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])',
        candidate,
    )
    return match.group(1) if match else None
|
| 69 |
+
|
| 70 |
+
# Multi-word informal expressions and the text that replaces them during
# preprocessing; an empty replacement removes the phrase.
informal_phrases = {
    "sat set sat set": "cepat",
    "ya mas": "",
}
|
| 73 |
+
|
| 74 |
+
def load_slang_txt(file_path):
    """Load a slang dictionary stored as JSON in a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded JSON content. When the file does not contain valid JSON,
        a message is printed and an empty dict is returned instead. Errors
        from opening or reading the file are not handled here.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        print(f"Error decoding JSON in the file: {file_path}")
        return {}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def load_slang_csv(file_path):
    """Build a slang lookup from the first two columns of a CSV file.

    The file is read as ISO-8859-1 with pandas' default header handling, so
    the first row supplies the column names and is not part of the result.
    NOTE(review): confirm the CSV really starts with a header row; if it does
    not, its first entry is silently dropped.

    Args:
        file_path: Path of the CSV file.

    Returns:
        A dict mapping each value of the first column to the value of the
        second column in the same row.
    """
    table = pd.read_csv(file_path, encoding='ISO-8859-1')
    slang_column = table.iloc[:, 0]
    formal_column = table.iloc[:, 1]
    return {slang: formal for slang, formal in zip(slang_column, formal_column)}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Combine slang dictionaries: two dictionary files plus a hand-written
# supplement.
slang_txt_path = 'combined_slang_words.txt'
slang_csv_path = 'new_kamusalay.csv'

slang_dict_txt = load_slang_txt(slang_txt_path)
slang_dict_csv = load_slang_csv(slang_csv_path)

slang_dict_tambahan = {
    "gw": "saya",
    "mau": "ingin",
    "ni": "ini",
    "aja": "saja",
    "gak": "tidak",
    "bgt": "sangat",
    "klo": "kalau",
    "bgs": "bagus",
    "masi": "masih",
    "msh": "masih",
    "lom": "belum",
    "blm": "belum",
    "ap": "apa",
    "brg": "barang",
    "ad": "ada",
    "blom": "belum",
    "kebli": "kebeli",
    "tp": "tapi",
    "org": "orang",
    "tdk": "tidak",
    "yg": "yang",
    "kalo": "kalau",
    "sy": "saya",
    "bng": "abang",
    "bg": "abang",
    "fto": "foto",
    "spek": "spesifikasi",
    "cm": "cuma",
    "jg": "juga",
    "pd": "pada",
    "skrg": "sekarang",
    "ga": "tidak",
    "gk": "tidak",
    "batre": "baterai",
    "gue": "saya",
    "dpt": "dapat",
    "kek": "seperti",
    "mna": "mana",
    "mnding": "mending",
    "mend": "mending",
    "dr": "dari",
    "sma": "sama",
    "drpada": "daripada",
}

# Later sources win on duplicate keys: the CSV overrides the TXT file, which
# in turn overrides the supplement above.
# NOTE(review): this means supplement entries that also exist in a file are
# ignored — confirm that precedence is the intended one.
slang_dict = {}
slang_dict.update(slang_dict_tambahan)
slang_dict.update(slang_dict_txt)
slang_dict.update(slang_dict_csv)

# Stopwords (Adjusted): the Indonesian NLTK list minus the words that should
# survive preprocessing.
retain_words = ['baru', 'lama', 'sama', 'tapi', 'tidak', 'dari', 'belum', 'bagi', 'mau', 'masalah']
stpwds_id = [
    stopword
    for stopword in set(stopwords.words('indonesian'))
    if stopword not in retain_words
]

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
|
| 120 |
+
|
| 121 |
+
# Function to replace slang terms
|
| 122 |
+
def replace_slang_in_text(text, slang_dict):
    """Swap every whitespace-separated token that is a key of ``slang_dict``.

    Args:
        text: Input string; it is split on whitespace.
        slang_dict: Mapping from slang token to its replacement.

    Returns:
        The tokens joined by single spaces, with known slang tokens replaced
        and all other tokens left unchanged.
    """
    normalized = []
    for token in text.split():
        normalized.append(slang_dict.get(token, token))
    return ' '.join(normalized)
|
| 126 |
+
|
| 127 |
+
def text_preprocessing(text, slang_dict):
    """Normalize a raw comment into a space-separated string of unique tokens.

    Steps, in order: lowercase; strip mentions, hashtags, newlines and URLs;
    replace punctuation with spaces; replace the module-level
    ``informal_phrases``; replace slang tokens using ``slang_dict``; tokenize;
    drop tokens listed in ``stpwds_id``; lemmatize with ``lemmatizer``; apply
    a few fixed token overrides; and remove repeated tokens.

    Args:
        text: Raw input string.
        slang_dict: Mapping from slang token to its replacement.

    Returns:
        The processed tokens joined by single spaces, each token appearing
        once in order of first occurrence.
    """
    # Case folding (convert text to lowercase)
    text = text.lower()

    # Replace mentions, hashtags, and newlines with a space
    text = re.sub(r"@[\w]+|#[\w]+|\n", " ", text)

    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix is treated as a URL.
    text = re.sub(r"http\S+|www\.\S+", " ", text)

    # Replace everything except word characters (letters, digits,
    # underscore), whitespace and apostrophes with a space
    text = re.sub(r"[^\w\s']", " ", text)

    # Replace informal phrases on whole-word boundaries only. A plain
    # substring replace would also hit the inside of other words, e.g.
    # "ya mas" inside "saya masih".
    for phrase, replacement in informal_phrases.items():
        text = re.sub(
            rf"(?<!\w){re.escape(phrase)}(?!\w)",
            lambda _match, _value=replacement: _value,
            text,
        )

    # Replace slang terms
    text = replace_slang_in_text(text, slang_dict)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords (set built once per call for constant-time lookups)
    blocked = set(stpwds_id)
    tokens = [word for word in tokens if word not in blocked]

    # Lemmatization (optional, but can improve performance)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Stemming with exceptions: fixed overrides applied to individual tokens
    stemming_exceptions = {"terasa": "terasa", "sat": "cepat", "set": "cepat"}
    tokens = [stemming_exceptions.get(word, word) for word in tokens]

    # Reassemble the text and remove duplicates, keeping first occurrences
    text = ' '.join(dict.fromkeys(tokens))

    return text
|
| 164 |
+
|
home.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from wordcloud import WordCloud
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
st.header("🔥 Phone Brands Word Clouds")

data = pd.read_csv("final_dataset_text_processed.csv")
data.dropna(inplace=True)

product_names = data["tipe_produk"].unique()

# Preselect "Galaxy S24" only when it is one of the available options, so the
# widget is never handed a default that is missing from the data.
default_selection = "Galaxy S24" if "Galaxy S24" in product_names else None
val = st.sidebar.pills(
    "Choose Phone Word Clouds To Show",
    product_names,
    selection_mode="multi",
    default=default_selection,
)

# Index the image directory once: lower-cased file name without extension ->
# file name. The first file seen for a name wins. Entries without a name or
# without an extension (e.g. ".DS_Store") are skipped instead of crashing.
image_files = {}
for file_name in os.listdir("images"):
    stem, extension = os.path.splitext(file_name)
    if stem and extension:
        image_files.setdefault(stem.lower().strip(), file_name)

for the_value in val or []:
    the_product = data[data["tipe_produk"] == the_value]
    if the_product.empty:
        continue

    # Build the cloud from the rows of the selected product only; using the
    # whole dataset would render the same cloud for every phone.
    text_data = " ".join(the_product["text_processed"].astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)

    col1, col2 = st.columns([2.5, 7.5])
    image_name = image_files.get(the_value.lower().strip())

    with col1:
        if image_name:
            st.markdown(f"### {the_value}")
            with Image.open(os.path.join("images", image_name)) as image:
                st.image(image, caption=f"{the_value} Image")

    with col2:
        st.markdown(f"### Word Cloud for {the_value}")

        # Use an explicit figure and close it after rendering so figures do
        # not pile up across reruns.
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        st.pyplot(fig)
        plt.close(fig)

    # Vertical spacer after each product.
    # NOTE(review): the original indentation of this line could not be
    # recovered — confirm it is meant to run once per product.
    st.write('<div style="padding: 10px;"></div>', unsafe_allow_html=True)
|
images/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
images/ASUS Zenfone 11 Ultra.jpg
ADDED
|
images/Galaxy A06.jpg
ADDED
|
images/Galaxy S24 Ultra.png
ADDED
|
images/Galaxy S24+.jpeg
ADDED
|
images/Galaxy S24.jpg
ADDED
|
images/Galaxy S25 Slim.png
ADDED
|
images/Galaxy S25 Ultra.png
ADDED
|
images/Galaxy S25.png
ADDED
|
images/Google Pixel 9.jpeg
ADDED
|
|
images/Huawei Mate XT Ultimate.jpg
ADDED
|
images/Huawei Nova Flip.png
ADDED
|
images/Huawei Pura 70 Pro.png
ADDED
|
images/Huawei Pura 70 Ultra.jpg
ADDED
|
images/Huawei Pura 70.jpeg
ADDED
|
images/Huawei Pura 80 Ultra.jpg
ADDED
|
images/Infinix Zero 30.jpeg
ADDED
|
images/OPPO A3x.jpg
ADDED
|
images/OPPO Find X7.jpg
ADDED
|
images/POCO M7 Pro 5G.png
ADDED
|
images/Pixel 10 Pro.jpeg
ADDED
|
|
images/Pixel 10.jpg
ADDED
|
|
images/Pixel 9a.jpg
ADDED
|
|
images/Realme 12 5G.jpg
ADDED
|
images/Realme 12 Pro 5G.jpg
ADDED
|
images/Realme 13 Series.jpg
ADDED
|
images/Realme C61.jpeg
ADDED
|
images/Redmi Note 13 5G.jpg
ADDED
|
images/Redmi Note 13 Pro 5G.jpg
ADDED
|
images/Redmi Note 14 Series.png
ADDED
|
images/Sharp Aquos Sense 8.jpg
ADDED
|
images/Tecno Spark 20c.png
ADDED
|
images/Vivo V40.jpeg
ADDED
|
images/Vivo X100 Pro.jpg
ADDED
|
images/Xiaomi 14T Pro.jpg
ADDED
|
images/Xiaomi 14T.png
ADDED
|
images/Xiaomi 15 Pro.jpg
ADDED
|
images/Xiaomi 15.png
ADDED
|
images/iPhone 16.jpeg
ADDED
|
images/iPhone 17 Air.jpg
ADDED
|
images/iPhone 17.jpeg
ADDED
|
images/iQOO Z8.jpeg
ADDED
|
model_lstm.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2b310e48ef93270b6104d17b37f2347a012bf0350b3370611705406827f7e68
|
| 3 |
+
size 133
|
new_kamusalay.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|