Spaces:
Running
Running
| """ | |
| PhilVerify — Labeled Dataset for XLM-RoBERTa Fine-tuning (Phase 10) | |
| 100 labeled PH-news samples across three classes: | |
| 0 = Credible (verifiable, sourced reporting) | |
| 1 = Unverified (unconfirmed claims, speculation) | |
| 2 = Likely Fake (disinformation, hoaxes, satire misread as fact) | |
| Languages: English, Filipino/Tagalog, Taglish (code-switched) | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
# Class id -> human-readable class name.
LABEL_NAMES: dict[int, str] = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
# Reverse lookup: class name -> class id.
LABEL_IDS: dict[str, int] = {name: idx for idx, name in LABEL_NAMES.items()}
# Derived from the mapping so the class count cannot drift from the label table.
NUM_LABELS: int = len(LABEL_NAMES)
@dataclass
class Sample:
    """A single labeled text sample from the dataset."""

    # Headline or claim text (English, Filipino/Tagalog, or Taglish).
    text: str
    # Class id: 0 = Credible, 1 = Unverified, 2 = Likely Fake.
    label: int  # 0 | 1 | 2
# ── Full dataset ──────────────────────────────────────────────────────────────
# Samples are grouped by class, then by language, for readability only; the
# order carries no meaning beyond that grouping.
# fmt: off
DATASET: list[Sample] = [
    # ── CREDIBLE (0) ──────────────────────────────────────────────────────────
    # English, sourced
    Sample("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
    Sample("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
    Sample("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
    Sample("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
    Sample("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
    Sample("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
    Sample("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
    Sample("DOF confirms revenue collection targets met for fiscal year 2025", 0),
    Sample("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
    Sample("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),
    Sample("PSA reports Philippine population grows to 115 million as of 2025 census", 0),
    Sample("Bangko Sentral ng Pilipinas keeps key policy rate at 6.50 percent", 0),
    Sample("DepEd announces K-12 curriculum review following PISA 2024 results", 0),
    Sample("PAGASA: Tropical storm Maring to make landfall in Quezon on Tuesday", 0),
    Sample("CHED approves 12 new state university programs in Mindanao", 0),
    Sample("LTO records 50,000 new vehicle registrations in January 2026", 0),
    Sample("PH army recovers high-powered weapons in Sulu operation", 0),
    Sample("BIR exceeds tax collection goal by P12 billion in first quarter", 0),
    Sample("Phivolcs raises alert level 2 over Mayon Volcano amid increased activity", 0),
    Sample("DOTr opens three new MRT-3 stations after P4.8B rehabilitation", 0),
    # Filipino/Tagalog sourced news
    Sample("Ayon sa DOH, 200 bata ang nagpabakuna laban sa tigdas ngayong linggo sa Maynila", 0),
    Sample("Inihayag ng PNP na 15 suspek ang nahuli sa operasyon laban sa droga sa Cavite", 0),
    Sample("Inaprubahan ng Senado ang panukalang batas para sa universal healthcare expansion", 0),
    Sample("Sinabi ng PAGASA na mula Lunes hanggang Miyerkules ay uulan sa buong Visayas", 0),
    Sample("Nakapag-ani ng P3 bilyon ang BIR mula sa drive laban sa tax evasion", 0),
    Sample("Nagbigay ng tulong ang DSWD sa 5,000 pamilya na tinamaan ng bagyo sa Leyte", 0),
    Sample("Ipinagutos ng Pangulo ang pagreview ng lahat ng kontrata ng DPWH sa mga probinsya", 0),
    Sample("Natuklasan ng NBI ang network ng mga pekeng lisensya sa Davao City", 0),
    Sample("Naglunsad ng libreng konsultasyon ang DOH para sa mga magsasaka sa Bukidnon", 0),
    Sample("Naipasa ng Kamara ang panukalang batas para sa dagdag na benepisyo ng SSS members", 0),
    # Taglish
    Sample("Sinabi ng BSP governor na ang inflation ay bumaba sa 3.2 percent ngayong Setyembre", 0),
    Sample("Ayon sa GMA News, naaprubahan na ng LGU ang P200M budget para sa road repair sa QC", 0),
    Sample("Inanunsyo ng DepEd na magkakaroon ng face-to-face classes sa lahat ng public schools", 0),
    Sample("Kinumpirma ng Malacañang na pumirma na ang Pangulo sa bagong minimum wage law", 0),
    # ── UNVERIFIED (1) ────────────────────────────────────────────────────────
    # English speculation / unconfirmed
    Sample("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
    Sample("VIRAL: Celebrity spotted at secret meeting with government official", 1),
    Sample("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
    Sample("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
    Sample("CLAIM: Government hiding true COVID-19 death count from public", 1),
    Sample("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
    Sample("REPORT: Certain barangay officials accepting bribes according to residents", 1),
    Sample("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
    Sample("CLAIM: New mandatory vaccine policy for all government employees", 1),
    Sample("Source says: Manila Water to increase rates by 20% next month", 1),
    Sample("Tipster tells media: NBI raid on Binondo warehouse reportedly yielded millions", 1),
    Sample("Insiders claim: President to appoint new DILG secretary within the week", 1),
    Sample("Leaked memo allegedly shows plans to raise electricity rates next quarter", 1),
    Sample("Unverified claim: Bulk of calamity funds in Batangas unaccounted for", 1),
    Sample("Social media post alleges senator accepted donations from known oligarch", 1),
    Sample("Image circulating online claims to show bribe being given to traffic enforcer", 1),
    Sample("Reports suggest government may backtrack on jeepney modernization program", 1),
    Sample("Alleged screenshot shows lawmaker voting against bill they publicly supported", 1),
    Sample("Claims circulating that MMDA is planning total private vehicle ban in CBD", 1),
    Sample("Text messages spreading claim that LBC will suspend operations nationwide", 1),
    # Filipino/Tagalog unverified
    Sample("AYON SA ISANG PINAGMUMULAN: Magkakaroon daw ng dagdag na lockdown sa Maynila bukas", 1),
    Sample("HINDI KUMPIRMADO: Sinasabing mag-aanunsyo ang gobyerno ng bagong curfew sa lahat ng lungsod", 1),
    Sample("Di pa totoo? May nagsasabing babalik na daw ang face shields sa lahat ng opisina", 1),
    Sample("Mayroon daw nakitang anomalya sa bilangan ng botante sa tatlong lalawigan na ito", 1),
    Sample("Alegasyon: Kilalang pulitiko raw ay may kinalaman sa smuggling sa pantalan ng Batangas", 1),
    Sample("Kumakalat na mensahe: Daw ipinapatigil ng gobyerno ang distribution ng ayuda", 1),
    Sample("Walang kumpirmasyon: Sinasabing may bagong buwis na ipapataw sa social media users", 1),
    # Taglish unverified
    Sample("May rumor na lumabas na mag-iimpose ng bagong quarantine ang gobyerno ngayong December", 1),
    Sample("Allegedly, may senator na caught on camera na nagbibigay ng pera sa opisyal", 1),
    Sample("Hindi pa confirmed pero may claim na plano nang ipasara ang ilang major highways", 1),
    # ── LIKELY FAKE (2) ───────────────────────────────────────────────────────
    # English clear disinformation
    Sample("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
    Sample("WATCH: Senator caught stealing money in Senate vault - full video", 2),
    Sample("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
    Sample("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
    Sample("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
    Sample("100% TRUE: Garlic cures COVID-19, doctors don't want you to know this!", 2),
    Sample("Filipino scientist discovers cure for cancer suppressed by big pharma", 2),
    Sample("BREAKING: Entire Luzon to experience 3-day total blackout next week", 2),
    Sample("MUST SHARE! Government will confiscate all gold and silver from citizens!", 2),
    Sample("PROOF: COVID vaccines contain human embryo DNA, Vatican confirms", 2),
    Sample("Senator caught on camera eating P500,000 in taxpayer money - PROOF HERE!", 2),
    Sample("BIGO ANG GOBYERNO: Pres. Marcos secretly meeting foreign agents to sell Mindanao", 2),
    Sample("Doctors silenced after discovering that drinking bleach cures hypertension", 2),
    Sample("Vatican secret document shows Philippines is plan to be New World Order hub", 2),
    Sample("ALERT: New 5G towers releasing mind-control signals in Metro Manila!", 2),
    Sample("Exclusive: Deep state operatives running shadow government from Makati", 2),
    Sample("LEAKED VIDEO: General orders troops to shoot civilians in Marawi!", 2),
    Sample("FREE ELECTRICITY: Meralco ordered to give free power to all household starting March", 2),
    Sample("COVID SECRET: Hospitals paid P200,000 per death certificate listing COVID-19", 2),
    Sample("The moon landing was planned in a studio in Manila - Filipino whistleblower reveals", 2),
    # Filipino/Tagalog fake
    Sample("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
    Sample("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
    Sample("PANLOLOKO NG GOBYERNO: Nagtatago raw ng tunay na bilang ng patay sa COVID ang DOH!", 2),
    Sample("KATOTOHANAN NA TINATAGO: Ang tubig sa Maynila ay may lason na galing sa gobyerno!", 2),
    Sample("PANGANIB: Ang bagong bakuna ay may microchip na para subaybayan ang mga Pilipino!", 2),
    Sample("PILIPINAS IBEBENTA! Umabot na sa kasunduan ang gobyerno para ibigay ang Palawan sa China!", 2),
    Sample("LUMALABAS NA ANG KATOTOHANAN: Walang tunay na COVID sa ating bansa, ginawa lang ito!", 2),
    Sample("SIGURADONG TOTOO: Ang pagkain ng sibuyas ay lunas sa sakit na kanser!", 2),
    Sample("BIGLANG BALITA: Bukas ay walang pasok sa lahat ng opisina dahil sa bagong utos!", 2),
    Sample("EXPOSED NA SILA: Mga nangungunang Pilipino ay miyembro ng secret na illuminati!", 2),
    # Taglish clickbait / disinformation
    Sample("OMG! Natuklasan ng mga scientist na ang COVID vaccine pa-DAGA ka sa 5 taon, paki-share!", 2),
    Sample("LIBRE NA ANG KURYENTE: Pinagawa na raw ng Pangulo ang Meralco na magbigay ng libre power sa lahat!", 2),
    Sample("HINDI SASABIHIN NG GMA AT ABS-CBN: Ang tunay na cure sa COVID ay nasa ating kusina na lang!", 2),
    Sample("BIGLANG ANUNSYO: P100,000 para sa bawat mamamayan, pinirmahan na ng Pangulo ang check!", 2),
    Sample("VIRAL: Ang tubig sa bote ng Wilkins ay nasubok na may nanalalason na sangkap!", 2),
    Sample("NAGISING NA AKO: Ang mga doktor ay tinatago ang katotohanang ang suka ay lunas sa dengue!", 2),
]
# fmt: on
def get_dataset() -> list[Sample]:
    """
    Return every labeled sample in the module.

    The returned object is the module-level ``DATASET`` list itself, not a
    copy, so in-place changes made by the caller are visible module-wide.
    """
    all_samples = DATASET
    return all_samples
def get_split(
    train_ratio: float = 0.8,
    seed: int = 42,
    samples: list[Sample] | None = None,
) -> tuple[list[Sample], list[Sample]]:
    """
    Split labeled samples into train / validation sets, stratified by label.

    Each label's samples are shuffled and cut independently, so both splits
    keep roughly the class balance of the input.

    Args:
        train_ratio: Fraction of each class placed in the training split.
            Must lie within [0, 1].
        seed: Seed for the private ``random.Random`` used for all shuffling;
            the same seed and input always produce the same split.
        samples: Samples to split. When omitted, the module-level ``DATASET``
            is used.

    Returns:
        A ``(train, val)`` tuple of sample lists.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1] (including NaN).
    """
    import random
    from collections import defaultdict

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    source = DATASET if samples is None else samples
    rng = random.Random(seed)

    # Group by whatever labels actually occur instead of a fixed {0, 1, 2}
    # table, so an unexpected label cannot raise KeyError.
    by_label: dict[int, list[Sample]] = defaultdict(list)
    for s in source:
        by_label[s.label].append(s)

    train: list[Sample] = []
    val: list[Sample] = []
    # Visit labels in sorted order so the RNG is consumed in a fixed sequence
    # regardless of the order in which labels first appear in the input.
    for label in sorted(by_label):
        shuffled = by_label[label][:]
        rng.shuffle(shuffled)
        # Every non-empty class contributes at least one training sample.
        split_idx = max(1, int(len(shuffled) * train_ratio))
        train.extend(shuffled[:split_idx])
        val.extend(shuffled[split_idx:])

    # Interleave the classes; otherwise each split would be ordered by label.
    rng.shuffle(train)
    rng.shuffle(val)
    return train, val
def class_weights(samples: list[Sample]) -> list[float]:
    """
    Compute inverse-frequency weights, one per class, for imbalanced training.

    The weight for class ``i`` is ``len(samples) / (NUM_LABELS * count_i)``.
    A class with no samples is treated as having a count of 1 so the division
    is always defined.

    Returns:
        A list of length ``NUM_LABELS``, indexed by class id.
    """
    from collections import Counter

    label_counts = Counter(sample.label for sample in samples)
    n_samples = len(samples)
    return [
        n_samples / (NUM_LABELS * max(label_counts[label_id], 1))
        for label_id in range(NUM_LABELS)
    ]