Spaces:

rizaaf
/

reCheck

Sleeping

reCheck / main.py

Xinqi04

baru7

bc5fd97 12 months ago

11.8 kB

	import joblib
	import pickle
	import pandas as pd
	import os
	# Work around permission problems by pointing the caches at a local, writable directory
	import os # ⬅️ PASTIKAN baris ini ada

	# Point the Hugging Face caches at a directory that Hugging Face Spaces allows
	# the process to write to. This runs before the model libraries are imported below.
	for _cache_var, _cache_dir in (
	    ("HF_HOME", "/tmp/hf_cache"),
	    ("TRANSFORMERS_CACHE", "/tmp/hf_cache/transformers"),
	    ("HF_DATASETS_CACHE", "/tmp/hf_cache/datasets"),
	):
	    os.environ[_cache_var] = _cache_dir


	from sentence_transformers import SentenceTransformer
	from fastapi.middleware.cors import CORSMiddleware
	import re
	import torch
	import numpy as np
	from fastapi import FastAPI, HTTPException
	from fastapi.concurrency import run_in_threadpool
	from pydantic import BaseModel
	from typing import List, Dict, Union, Optional
	import faiss

	from dotenv import load_dotenv
	from supabase import create_client, Client

	# Merge variables from a local .env file (if present) into the process
	# environment, then read the Supabase credentials from it.
	load_dotenv()
	SUPABASE_URL = os.environ.get("SUPABASE_URL")
	SUPABASE_KEY = os.environ.get("SUPABASE_KEY")

	# Fail fast at import time: nothing below can work without both settings.
	if not SUPABASE_URL or not SUPABASE_KEY:
	    raise RuntimeError("SUPABASE_URL and SUPABASE_KEY must be set in environment variables or .env file.")

	# Module-level Supabase client shared by the schema check and the endpoints.
	try:
	    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
	except Exception as e:
	    # Re-raise with the original exception as the explicit cause so the
	    # traceback shows why client creation failed.
	    raise RuntimeError(f"Could not connect to Supabase: {e}") from e
	else:
	    print("Successfully connected to Supabase.")

	# Load every model and data artifact once at import time. Any failure aborts
	# start-up with a RuntimeError that keeps the original exception as its cause.
	try:
	    # Sentence-embedding model, moved to the GPU when one is available.
	    model_sbert = SentenceTransformer('naufalihsan/indonesian-sbert-large')
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    model_sbert.to(device)

	    # NOTE(security): joblib.load and pickle.load can execute arbitrary code
	    # while deserializing. This is acceptable only because these files ship
	    # with the application; never point them at untrusted input.
	    model_lr = joblib.load('models/lr_98_24.pkl')
	    loaded_index_faiss = faiss.read_index('embeddings/embeddings_model.faissindex')
	    print(f"Indeks FAISS dimuat. Jumlah vektor: {loaded_index_faiss.ntotal}")
	    # NOTE(review): loaded_metadata_list is not referenced by any code visible
	    # in this file section; it is kept because other code may rely on it.
	    with open('embeddings/metadata_model.pkl', "rb") as f:
	        loaded_metadata_list = pickle.load(f)
	    print(f"Metadata dimuat. Jumlah item metadata: {len(loaded_metadata_list)}")

	    # Article table that FAISS result positions are looked up in (via iloc).
	    dataset = pd.read_csv('Datasets/final_dataset.csv')
	    if 'hoax' in dataset.columns:
	        # Normalise the label column to int so it can be compared with == 1.
	        dataset['hoax'] = dataset['hoax'].astype(int)
	except FileNotFoundError as e:
	    raise RuntimeError(f"Error loading model or data files: {e}") from e
	except Exception as e:
	    raise RuntimeError(f"An unexpected error occurred during model/data loading: {e}") from e

	def check_database_schema():
	    """
	    Print the table/column details returned by the Supabase RPC
	    ``get_schema_details``.

	    Each returned row is read through its ``table_name``, ``column_name`` and
	    ``data_type`` keys. A table header is printed whenever the table name
	    differs from the previous row's. Any exception is caught and reported on
	    stdout; the function always returns ``None``.
	    """
	    print("Mencoba mengambil skema database dari Supabase...")
	    try:
	        # The 'get_schema_details' function must already exist in Supabase.
	        response = supabase.rpc('get_schema_details').execute()

	        if not response.data:
	            print("Tidak ada data skema yang ditemukan atau terjadi error.")
	            if hasattr(response, 'error') and response.error:
	                print(f"Detail Error: {response.error}")
	            return

	        print("Skema Database Berhasil Diambil:")
	        last_table_seen = None
	        for schema_row in response.data:
	            table_name = schema_row['table_name']
	            column_name = schema_row['column_name']
	            data_type = schema_row['data_type']

	            # Print the table header only when a new table starts.
	            if last_table_seen != table_name:
	                print(f"\n[Tabel: {table_name}]")
	                last_table_seen = table_name

	            print(f" - Kolom: {column_name} (Tipe: {data_type})")

	    except Exception as e:
	        print(f"Terjadi kesalahan saat mengambil skema: {e}")

	# Runs once when this module is imported; the function prints its findings
	# and catches its own exceptions, so a failure here does not stop start-up.
	check_database_schema()

	def normalize_text_for_counting(text: str) -> str:
	    """Return a canonical form of ``text`` used as the key for counting claims.

	    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
	    whitespace is removed, and whitespace runs are collapsed to a single
	    space with the ends trimmed. Non-string input yields ``""``.
	    """
	    if not isinstance(text, str):
	        return ""
	    alnum_only = re.sub(r'[^a-z0-9\s]', '', text.lower())
	    return re.sub(r'\s+', ' ', alnum_only).strip()

	def embeddings(text: str) -> np.ndarray:
	    """Encode ``text`` with the module-level SBERT model.

	    The text is lower-cased and stripped of every character outside ASCII
	    letters, digits, whitespace, brackets and basic punctuation before it is
	    encoded on ``device``. The resulting tensor is moved to the CPU and
	    returned as a NumPy array.
	    """
	    cleaned = re.sub(
	        r'[^A-Za-z0-9\s\[\]\(\)\{\}\<\>\.,!?;:\'\"-]',
	        '',
	        text.lower(),
	    )
	    encoded = model_sbert.encode(
	        cleaned,
	        convert_to_tensor=True,
	        device=device,
	        show_progress_bar=False,
	    )
	    return encoded.cpu().numpy()

	def pipeline_hoax(text: str) -> Dict[str, float]:
	    """Classify ``text`` and return its hoax/valid probabilities.

	    The text is embedded, reshaped to a single-row matrix and passed to
	    ``model_lr.predict_proba``. Column 1 of the result is reported as
	    ``hoax_prob`` and column 0 as ``valid_prob``.
	    """
	    # NOTE(review): assumes the classifier's class order is [valid, hoax] — confirm.
	    single_row = embeddings(text).reshape(1, -1)
	    class_probs = model_lr.predict_proba(single_row)[0]
	    hoax_prob = float(class_probs[1])
	    valid_prob = float(class_probs[0])
	    return {"hoax_prob": hoax_prob, "valid_prob": valid_prob}

	def retrieval(embeddings_val: np.ndarray, k: int = 5) -> tuple[np.ndarray, np.ndarray]:
	    """Search the module-level FAISS index for the nearest stored vectors.

	    Args:
	        embeddings_val: Query embedding. It is cast to float32 if needed, and
	            a 1-D vector is reshaped into a single-row matrix. For 2-D input
	            only the first row's results are returned.
	        k: Number of neighbours to request. Defaults to 5, the value that was
	            previously hard-coded.

	    Returns:
	        ``(indices, distances)`` for the first query row, in that order.

	    Raises:
	        ValueError: If ``k`` is smaller than 1.
	    """
	    if k < 1:
	        raise ValueError(f"k must be a positive integer, got {k!r}")

	    # The index is searched with float32 input; convert only when necessary.
	    if embeddings_val.dtype != np.float32:
	        embeddings_val = embeddings_val.astype(np.float32)

	    embeddings_2d = embeddings_val.reshape(1, -1) if embeddings_val.ndim == 1 else embeddings_val

	    distances_2d, indices_2d = loaded_index_faiss.search(embeddings_2d, k)
	    return indices_2d[0], distances_2d[0]


	def retrieval_pipeline(text: str) -> pd.DataFrame:
	    """Return the dataset rows closest to ``text``, with a ``score`` column.

	    The text is embedded and searched in the FAISS index. The returned
	    positions select rows of ``dataset`` by position, and the matching
	    distances are attached as ``score``. The result is a copy, so the
	    module-level ``dataset`` is never modified.
	    """
	    embedding_val = embeddings(text)
	    idx, score = retrieval(embedding_val)

	    # FAISS fills result slots it cannot satisfy with label -1. Passing -1 to
	    # iloc would silently return the last dataset row, so drop those slots.
	    found = idx >= 0
	    result = dataset.iloc[idx[found]].copy()
	    result['score'] = score[found]
	    return result



	# FastAPI application object; the endpoints defined later register on it.
	app = FastAPI(
	    description="API for Hoax Identification and Article Similarity Retrieval for Indonesian News.",
	    title="TruthCheck API",
	)

	# CORS is fully open: every origin, method and header is accepted.
	# NOTE(review): wildcard origins combined with allow_credentials=True is very
	# permissive — confirm this is intended for the deployed frontend.
	app.add_middleware(
	    CORSMiddleware,
	    allow_credentials=True,
	    allow_headers=["*"],
	    allow_methods=["*"],
	    allow_origins=["*"],
	)


	# Request body of POST /check-claim.
	class CheckClaimRequest(BaseModel):
	# Raw claim text; the endpoint answers HTTP 400 when it is empty or whitespace-only.
	claim_text: str

	# One retrieved dataset article, as built by format_supporting_articles().
	class SupportingArticle(BaseModel):
	# Article title ('' when the dataset value is missing).
	title: str
	# Article text, cut to the first 300 characters plus "..." when longer.
	full_text_snippet: str
	# Article URL ('' when missing).
	url: str
	# Value of the dataset's 'source' column ('' when missing).
	source: str
	# Derived from the dataset's 'Timestamp' column: a year string when the
	# timestamp could be interpreted, otherwise the raw timestamp text; None when missing.
	published_year: Optional[str] = None
	# "HOAX" when the row's 'hoax' value equals 1, otherwise "VALID".
	hoax_status: str
	# Score returned by the FAISS search for this article.
	# NOTE(review): whether larger or smaller means "more similar" depends on the index metric — confirm.
	similarity_score: float

	# Response body of POST /check-claim.
	class CheckClaimResponse(BaseModel):
	# The claim text exactly as received in the request.
	claim_text: str
	# Classifier probability reported as "hoax_prob" by pipeline_hoax().
	hoax_probability: float
	# Classifier probability reported as "valid_prob" by pipeline_hoax().
	valid_probability: float
	# Filled with valid_probability * 100 by the endpoint, i.e. the "valid"
	# probability as a percentage rather than a measured model accuracy.
	accuracy_percentage: float
	# Articles returned by the similarity search for this claim.
	supporting_articles: List[SupportingArticle]

	# One element of the GET /popular-checks response list.
	class PopularCheckItem(BaseModel):
	# The claim text shortened to 70 characters plus "..." when longer.
	example_claim_title: str
	# Full original claim text as stored in the popular_claims table.
	claim_text: str
	# Classifier probability reported as "hoax_prob" by pipeline_hoax().
	hoax_probability: float
	# Classifier probability reported as "valid_prob" by pipeline_hoax().
	valid_probability: float
	# Filled with valid_probability * 100 by the endpoint.
	accuracy_percentage: float
	# Articles returned by the similarity search for this claim.
	supporting_articles: List[SupportingArticle]


	# Timestamp layouts whose year can be read straight from the text. Each
	# pattern captures the four-digit year as group 1.
	_TIMESTAMP_YEAR_PATTERNS = (
	    re.compile(r'^(\d{4})-\d{2}-\d{2}'),                    # 2023-06-12 ...
	    re.compile(r'^[A-Za-z]+ \d{1,2}, (\d{4})'),             # June 12, 2023 ...
	    re.compile(r'^[A-Za-z]+, \d{1,2} [A-Za-z]+ (\d{4})'),   # Senin, 12 Juni 2023 10:00 WIB
	)


	def _text_or_empty(value) -> str:
	    """Return ``str(value)``, or ``''`` when the value is missing (None/NaN)."""
	    return '' if pd.isna(value) else str(value)


	def _published_year(raw_timestamp) -> Optional[str]:
	    """Extract the publication year from a dataset timestamp.

	    Returns ``None`` for a missing value, the four-digit year for the
	    recognised layouts, and the unchanged timestamp text for anything else.

	    The year is taken from the digits the layout pattern already matched
	    instead of going through date parsing. Day/month-name parsing is
	    locale-dependent, so Indonesian names such as "Senin" or "Juni" failed to
	    parse and the whole raw timestamp ended up in the year field.
	    """
	    if pd.isna(raw_timestamp):
	        return None
	    timestamp_text = str(raw_timestamp)
	    for pattern in _TIMESTAMP_YEAR_PATTERNS:
	        matched = pattern.match(timestamp_text)
	        if matched:
	            return matched.group(1)
	    return timestamp_text


	def format_supporting_articles(retrieval_results_df: pd.DataFrame) -> List[SupportingArticle]:
	    """Convert retrieval result rows into ``SupportingArticle`` objects.

	    Args:
	        retrieval_results_df: Rows selected from the dataset. The columns
	            'Title', 'FullText', 'Url', 'source', 'Timestamp' and 'hoax' are
	            read when present; a 'score' column is required.

	    Returns:
	        One ``SupportingArticle`` per row, in row order. Missing text values
	        become ``''``, the full text is cut to 300 characters plus "..." when
	        longer, and ``hoax_status`` is "HOAX" only when 'hoax' equals 1.

	    Raises:
	        KeyError: If a row has no 'score' value.
	    """
	    supporting_articles_list = []
	    for _, row in retrieval_results_df.iterrows():
	        full_text = _text_or_empty(row.get('FullText'))
	        snippet = full_text[:300] + "..." if len(full_text) > 300 else full_text

	        supporting_articles_list.append(
	            SupportingArticle(
	                title=_text_or_empty(row.get('Title')),
	                full_text_snippet=snippet,
	                url=_text_or_empty(row.get('Url')),
	                source=_text_or_empty(row.get('source')),
	                published_year=_published_year(row.get('Timestamp')),
	                hoax_status="HOAX" if row.get('hoax') == 1 else "VALID",
	                similarity_score=float(row['score']),
	            )
	        )
	    return supporting_articles_list

	# --- API Endpoints ---

	# POST /check-claim: classify a claim and return similar dataset articles.
	# Comments are used instead of a docstring so the generated OpenAPI
	# description of the endpoint stays unchanged.
	@app.post("/check-claim", response_model=CheckClaimResponse, summary="Periksa Klaim/Pernyataan")
	async def check_claim(request: CheckClaimRequest):
	    claim = request.claim_text
	    if not (claim and claim.strip()):
	        raise HTTPException(status_code=400, detail="Claim text cannot be empty.")

	    counting_key = normalize_text_for_counting(claim)
	    print(counting_key)

	    def _sync_upsert_claim(norm_text: str, orig_text: str):
	        # Blocking Supabase call that records the claim for the popularity
	        # ranking. Best-effort: every failure is only printed.
	        try:
	            response = supabase.rpc(
	                'upsert_popular_claim',
	                {'p_normalized_text': norm_text, 'p_original_text': orig_text}
	            ).execute()
	        except Exception as e:
	            print(f"Exception during Supabase RPC call: {e}")
	            return

	        if hasattr(response, 'error') and response.error:
	            print(f"Supabase RPC returned an error: {response.error}")
	        else:
	            print(f"Supabase RPC call successful for: {norm_text[:50]}")

	    # Claims that normalise to an empty key are not counted.
	    if counting_key:
	        try:
	            await run_in_threadpool(_sync_upsert_claim, counting_key, claim)
	        except Exception as e:
	            print(f"Error running Supabase task in threadpool: {e}")

	    # Run the blocking model calls in the threadpool: classification first,
	    # then the similarity search.
	    probabilities = await run_in_threadpool(pipeline_hoax, claim)
	    similar_rows = await run_in_threadpool(retrieval_pipeline, claim)
	    articles = format_supporting_articles(similar_rows)

	    valid_prob = probabilities["valid_prob"]
	    return CheckClaimResponse(
	        claim_text=claim,
	        hoax_probability=probabilities["hoax_prob"],
	        valid_probability=valid_prob,
	        accuracy_percentage=valid_prob * 100,
	        supporting_articles=articles,
	    )

	# GET /popular-checks: the most-searched claims, each re-evaluated on the fly.
	# Comments are used instead of a docstring so the generated OpenAPI
	# description of the endpoint stays unchanged.
	@app.get("/popular-checks", response_model=List[PopularCheckItem], summary="Pengecekan Terpopuler")
	async def get_popular_checks(limit: int = 5):
	    popular_checks_results = []

	    def _fetch_top_claims():
	        # Blocking Supabase query: claims ordered by search_count, highest first.
	        query = supabase.table("popular_claims").select("original_claim_text, search_count")
	        return query.order("search_count", desc=True).limit(limit).execute()

	    # Any failure (the query or a single item) is printed, and whatever was
	    # collected up to that point is returned.
	    try:
	        response = await run_in_threadpool(_fetch_top_claims)

	        if not response.data:
	            print("No popular claims found in Supabase or error in fetching.")
	            if hasattr(response, 'error') and response.error:
	                print(f"Supabase fetch error: {response.error}")
	        else:
	            for claim_row in response.data:
	                claim = claim_row["original_claim_text"]

	                # Short display title: the first 70 characters plus an ellipsis.
	                if len(claim) > 70:
	                    short_title = claim[:70] + "..."
	                else:
	                    short_title = claim

	                probabilities = await run_in_threadpool(pipeline_hoax, claim)
	                similar_rows = await run_in_threadpool(retrieval_pipeline, claim)
	                articles = format_supporting_articles(similar_rows)

	                valid_prob = probabilities["valid_prob"]
	                popular_checks_results.append(
	                    PopularCheckItem(
	                        example_claim_title=short_title,
	                        claim_text=claim,
	                        hoax_probability=probabilities["hoax_prob"],
	                        valid_probability=valid_prob,
	                        accuracy_percentage=valid_prob * 100,
	                        supporting_articles=articles,
	                    )
	                )

	    except Exception as e:
	        print(f"Error fetching popular checks from Supabase: {e}")

	    return popular_checks_results