Commit 090987a
Parent(s): initial

This view is limited to 50 files because it contains too many changes.
- .dockerignore +9 -0
- .gitignore +82 -0
- Dockerfile +32 -0
- ai_api/.gitignore +68 -0
- ai_api/__init__.py +0 -0
- ai_api/admin.py +10 -0
- ai_api/api.py +44 -0
- ai_api/api_urls.py +10 -0
- ai_api/apps.py +63 -0
- ai_api/controllers/__init__.py +2 -0
- ai_api/controllers/classification.py +15 -0
- ai_api/controllers/transcription.py +16 -0
- ai_api/forms.py +86 -0
- ai_api/globals.py +6 -0
- ai_api/library/apify_scraper.py +893 -0
- ai_api/library/config.py +131 -0
- ai_api/library/devlab_image.py +487 -0
- ai_api/library/lowyat_crawler.py +714 -0
- ai_api/library/priority_indexer.py +360 -0
- ai_api/library/sentiment_analyzer.py +91 -0
- ai_api/library/simple_keyword_extraction.py +205 -0
- ai_api/library/websearch.py +237 -0
- ai_api/middleware.py +40 -0
- ai_api/migrations/0001_initial.py +24 -0
- ai_api/migrations/__init__.py +0 -0
- ai_api/models.py +18 -0
- ai_api/request_serializer.py +30 -0
- ai_api/templates/base-copy.html +35 -0
- ai_api/templates/base.html +61 -0
- ai_api/templates/classification.html +142 -0
- ai_api/templates/home-copy.html +38 -0
- ai_api/templates/home.html +60 -0
- ai_api/templates/image_profiling.html +122 -0
- ai_api/templates/register_face.html +42 -0
- ai_api/templates/transcription.html +159 -0
- ai_api/tests.py +3 -0
- ai_api/urls.py +12 -0
- ai_api/views.py +799 -0
- ai_api/widgets.py +5 -0
- csv_people.py +20 -0
- delete_milvus.py +29 -0
- devlab_next/.gitignore +68 -0
- devlab_next/__init__.py +0 -0
- devlab_next/asgi.py +16 -0
- devlab_next/settings.py +166 -0
- devlab_next/urls.py +33 -0
- devlab_next/wsgi.py +16 -0
- docker-compose.yml +95 -0
- download_people.py +14 -0
- list_faces.py +23 -0
.dockerignore
ADDED
@@ -0,0 +1,9 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.db
+venv/
+.git/
+nohup.out
+core
.gitignore
ADDED
@@ -0,0 +1,82 @@
+# Python bytecode files
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Virtual environment
+venv/
+env/
+
+# Distribution / packaging
+*.egg
+*.egg-info
+dist/
+build/
+*.whl
+
+# IDE files
+.idea/
+.vscode/
+
+# Jupyter Notebook files
+.ipynb_checkpoints
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Test and coverage reports
+.coverage
+*.coveragerc
+nosetests.xml
+coverage.xml
+*.coveralls.yml
+
+# MyPy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pytest
+.cache/
+
+# Sphinx documentation
+docs/_build/
+
+# pytest and flake8
+*.log
+
+# VS Code settings
+.vscode/
+
+# Django secrets
+*.env
+
+# Flask instance folder
+instance/
+
+# PyCharm project files
+.idea/
+
+# Other Python-related files
+*.bak
+*.swp
+*.swo
+ddet_classification/
+.DS_Store
+.pkl
+people/
+people_backup/
+*.mp3
+*.wav
+media/uploads/
+media/vtt/
+volumes/
+output/
+reports/
+data/
+ai_api/library/data/
+ai_api/library/output/
+ai_api/library/cache/
+ai_api/library/reports/
Dockerfile
ADDED
@@ -0,0 +1,32 @@
+FROM python:3.9-slim
+
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TF_CPP_MIN_LOG_LEVEL=2
+
+# Install dependencies
+#RUN apt-get update && apt-get install -y exiftool ffmpeg curl libglib2.0-0 libsm6 libxext6 libxrender-dev
+# Install Chrome & dependencies
+RUN apt-get update && apt-get install -y \
+    wget unzip curl gnupg exiftool ffmpeg \
+    fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libatk1.0-0 libcups2 libdbus-1-3 libgdk-pixbuf2.0-0 \
+    libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1 \
+    chromium chromium-driver \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set work directory
+WORKDIR /app
+
+# Copy project files
+COPY . /app
+
+# Install Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Expose port
+EXPOSE 8000
+
+# Run app using Gunicorn
+#CMD ["gunicorn", "--bind", "0.0.0.0:8000", "devlab_next.wsgi:application"]
+CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]
+
ai_api/.gitignore
ADDED
@@ -0,0 +1,68 @@
+# Python bytecode files
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Virtual environment
+venv/
+env/
+
+# Distribution / packaging
+*.egg
+*.egg-info
+dist/
+build/
+*.whl
+
+# IDE files
+.idea/
+.vscode/
+
+# Jupyter Notebook files
+.ipynb_checkpoints
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Test and coverage reports
+.coverage
+*.coveragerc
+nosetests.xml
+coverage.xml
+*.coveralls.yml
+
+# MyPy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pytest
+.cache/
+
+# Sphinx documentation
+docs/_build/
+
+# pytest and flake8
+*.log
+
+# VS Code settings
+.vscode/
+
+# Django secrets
+*.env
+
+# Flask instance folder
+instance/
+
+# PyCharm project files
+.idea/
+
+# Other Python-related files
+*.bak
+*.swp
+*.swo
+ddet_classification/
+.DS_Store
+.pkl
ai_api/__init__.py
ADDED
File without changes
ai_api/admin.py
ADDED
@@ -0,0 +1,10 @@
+from django.contrib import admin
+from .models import APIClient
+
+# admin.site.register(APIClient)
+
+@admin.register(APIClient)
+class APIClientAdmin(admin.ModelAdmin):
+    list_display = ('name', 'client_id', 'created_at')
+    readonly_fields = ('client_id', 'secret_key', 'created_at')
+    fields = ('name', 'client_id', 'secret_key', 'created_at')  # show in form
ai_api/api.py
ADDED
@@ -0,0 +1,44 @@
+from django.shortcuts import render
+from django.http import JsonResponse
+from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm, TranscribeForm, YouTubeURLForm
+import shutil
+from django.conf import settings
+import torch
+import json
+import os
+from PIL import Image as PILImage
+import io
+import tempfile
+from django.core.cache import cache
+import numpy as numpy_lib
+import pickle
+from deepface import DeepFace
+import cv2
+import base64
+from io import BytesIO
+from . import globals
+import tempfile
+import mimetypes
+import subprocess
+import logging
+import uuid
+import yt_dlp
+import time
+import re
+from pydub import AudioSegment
+import pandas as pd
+import csv
+from .models import APIClient
+
+API_VERSION = '1.0.0'
+
+def index(request):
+    return JsonResponse({'message': 'Welcome to the BERNAMA Fact Check API', 'version': API_VERSION})
+
+def clients(request):
+    # if not hasattr(request, 'api_client'):
+    #     return JsonResponse({'error': 'Unauthorized'}, status=401)
+
+    clients = list(APIClient.objects.values('name', 'client_id', 'created_at'))
+    return JsonResponse({'clients': clients})
+
ai_api/api_urls.py
ADDED
@@ -0,0 +1,10 @@
+from django.urls import path
+from . import api, controllers
+
+urlpatterns = [
+    path('', api.index, name='index'),
+    path('ping/', api.index, name='index'),
+    path('clients/', api.clients, name='clients'),
+    path('transcription/', controllers.transcription.TranscriptionAPIView.as_view(), name='transcription'),
+    path('classification/', controllers.classification.ClassificationAPIView.as_view(), name='classification'),
+]
ai_api/apps.py
ADDED
@@ -0,0 +1,63 @@
+from django.apps import AppConfig
+
+class AiApiConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'ai_api'
+
+    def ready(self):
+        from . import globals
+        from deepface import DeepFace
+        from ai_api.library.devlab_image import DevLabImage
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
+        import whisper
+        import os
+        from safetensors import safe_open
+        import torch
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        globals.devlab_image = DevLabImage()
+
+        # Load HuggingFace tokenizer and model once
+        save_path = os.path.join(os.path.dirname(__file__), "ddet_classification")
+        print(f"Model path: {save_path}")
+        globals.save_path = save_path
+
+        # Load tokenizer
+        try:
+            globals.tokenizer = AutoTokenizer.from_pretrained(save_path, device=device)
+            print("Tokenizer loaded ✅")
+        except Exception as e:
+            print(f"Failed to load tokenizer: {e}")
+            globals.tokenizer = None
+
+        # Check .safetensors before loading model
+        try:
+            safetensor_file = os.path.join(save_path, "model.safetensors")
+            if os.path.exists(safetensor_file):
+                with safe_open(safetensor_file, framework="pt") as f:
+                    print("Safetensors file checked ✅")
+
+            globals.model = AutoModelForSequenceClassification.from_pretrained(save_path)
+            globals.model.eval()
+            print("Classification model loaded ✅")
+
+        except Exception as e:
+            print(f"Failed to load classification model: {e}")
+            globals.model = None
+
+        # Load Whisper model
+        try:
+            globals.whisper_model = whisper.load_model("large", device=device)
+            print("Whisper model loaded ✅")
+        except Exception as e:
+            print(f"Failed to load Whisper model: {e}")
+            globals.whisper_model = None
+
+        # Load FaceNet model
+        try:
+            globals.facenet_model = DeepFace.build_model("Facenet")
+            print("Facenet model loaded ✅")
+        except Exception as e:
+            print(f"Failed to load FaceNet model: {e}")
+            globals.facenet_model = None
ai_api/controllers/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from . import transcription
+from . import classification
ai_api/controllers/classification.py
ADDED
@@ -0,0 +1,15 @@
+# classification.py
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from ..request_serializer import ClassificationRequestSerializer
+
+class ClassificationAPIView(APIView):
+    def get(self, request):
+        return Response({"message": "Classification API"})
+
+    def post(self, request):
+        serializer = ClassificationRequestSerializer(data=request.data)
+        if serializer.is_valid():
+            return Response({"message": "Classification API"})
+        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
ai_api/controllers/transcription.py
ADDED
@@ -0,0 +1,16 @@
+# transcription.py
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from ..request_serializer import TranscriptionRequestSerializer
+
+class TranscriptionAPIView(APIView):
+    def get(self, request):
+        return Response({"message": "Transcription API"})
+
+    def post(self, request):
+        serializer = TranscriptionRequestSerializer(data=request.data)
+        if serializer.is_valid():
+            media_file = request.FILES.get('media')
+            return Response({"media_file": media_file.name})
+        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
|
ai_api/forms.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django import forms
|
| 2 |
+
from .widgets import MultipleFileInput
|
| 3 |
+
from django.core.exceptions import ValidationError
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ImageUploadForm(forms.Form):
|
| 7 |
+
image = forms.ImageField(
|
| 8 |
+
widget=forms.ClearableFileInput(attrs={
|
| 9 |
+
'class': 'form-control',
|
| 10 |
+
'capture': 'user'
|
| 11 |
+
})
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
class ClassificationForm(forms.Form):
|
| 15 |
+
claim = forms.CharField(
|
| 16 |
+
label="Claim:",
|
| 17 |
+
widget=forms.Textarea(attrs={
|
| 18 |
+
'class': 'form-control',
|
| 19 |
+
'rows': 5,
|
| 20 |
+
'placeholder': 'Enter your claim or statement',
|
| 21 |
+
})
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
class RegisterFaceForm(forms.Form):
|
| 25 |
+
person = forms.CharField(
|
| 26 |
+
label="Person:",
|
| 27 |
+
widget=forms.TextInput(attrs={
|
| 28 |
+
'class': 'form-control',
|
| 29 |
+
'placeholder': 'e.g: ANWAR IBRAHIM',
|
| 30 |
+
})
|
| 31 |
+
)
|
| 32 |
+
keywords = forms.CharField(
|
| 33 |
+
label="Keyword:",
|
| 34 |
+
required=False,
|
| 35 |
+
widget=forms.TextInput(attrs={
|
| 36 |
+
'class': 'form-control',
|
| 37 |
+
'placeholder': 'e.g: Prime Minister of Malaysia',
|
| 38 |
+
})
|
| 39 |
+
)
|
| 40 |
+
images = forms.FileField(
|
| 41 |
+
required=False,
|
| 42 |
+
widget=MultipleFileInput(attrs={
|
| 43 |
+
'multiple': True,
|
| 44 |
+
'class': 'form-control',
|
| 45 |
+
'capture': 'user'
|
| 46 |
+
})
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
class TranscribeForm(forms.Form):
|
| 50 |
+
url = forms.CharField(
|
| 51 |
+
label="YouTube URL:",
|
| 52 |
+
required=False,
|
| 53 |
+
widget=forms.TextInput(attrs={
|
| 54 |
+
'type': 'url',
|
| 55 |
+
'class': 'form-control',
|
| 56 |
+
'placeholder': 'Enter YouTube URL',
|
| 57 |
+
|
| 58 |
+
})
|
| 59 |
+
)
|
| 60 |
+
file = forms.FileField(
|
| 61 |
+
label="Upload Audio/Video File",
|
| 62 |
+
required=False,
|
| 63 |
+
widget=forms.ClearableFileInput(attrs={
|
| 64 |
+
'class': 'form-control',
|
| 65 |
+
'accept': 'audio/*,video/*',
|
| 66 |
+
|
| 67 |
+
})
|
| 68 |
+
)
|
| 69 |
+
def clean(self):
|
| 70 |
+
cleaned_data = super().clean()
|
| 71 |
+
url = cleaned_data.get("url")
|
| 72 |
+
file = cleaned_data.get("file")
|
| 73 |
+
|
| 74 |
+
if not url and not file:
|
| 75 |
+
raise ValidationError("You must provide either a YouTube URL or upload a file.")
|
| 76 |
+
if url and file:
|
| 77 |
+
raise ValidationError("Please provide only one: YouTube URL or a file upload.")
|
| 78 |
+
|
| 79 |
+
class YouTubeURLForm(forms.Form):
|
| 80 |
+
youtube_url = forms.URLField(
|
| 81 |
+
label='YouTube Video URL',
|
| 82 |
+
widget=forms.URLInput(attrs={
|
| 83 |
+
'class': 'form-control',
|
| 84 |
+
'placeholder': 'https://www.youtube.com/watch?v=example'
|
| 85 |
+
})
|
| 86 |
+
)
|
ai_api/globals.py
ADDED
@@ -0,0 +1,6 @@
+devlab_image = None
+tokenizer = None
+model = None
+save_path = None
+whisper_model = None
+facenet_model = None
ai_api/library/apify_scraper.py
ADDED
|
@@ -0,0 +1,893 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# apify_scraper.py
|
| 2 |
+
# Updated version: Uses separate Apify tokens for Facebook and TikTok tasks
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
import time
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import hashlib
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
|
| 12 |
+
# Create cache directory
|
| 13 |
+
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
|
| 14 |
+
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 15 |
+
|
| 16 |
+
# Import configuration settings
|
| 17 |
+
try:
|
| 18 |
+
from .config import (
|
| 19 |
+
# API tokens
|
| 20 |
+
APIFY_TOKEN, APIFY_TOKEN_FB, APIFY_TOKEN_TIKTOK,
|
| 21 |
+
# Task IDs
|
| 22 |
+
POST_TASK_ID_SEARCH, COMMENT_TASK_ID, TIKTOK_VIDEO_TASK_ID, TIKTOK_COMMENT_TASK_ID,
|
| 23 |
+
# Data source settings
|
| 24 |
+
USE_FACEBOOK, USE_TIKTOK, USE_SERPAPI, USE_SERPER, USE_DUCKDUCKGO, USE_LOWYAT,
|
| 25 |
+
# Comment settings
|
| 26 |
+
USE_COMMENTS,
|
| 27 |
+
# Result limits
|
| 28 |
+
FACEBOOK_MAX_RESULTS, TIKTOK_MAX_RESULTS, WEB_SEARCH_MAX_RESULTS, LOWYAT_MAX_THREADS,
|
| 29 |
+
# Lowyat Forum settings
|
| 30 |
+
LOWYAT_SECTIONS
|
| 31 |
+
)
|
| 32 |
+
# Use settings from config
|
| 33 |
+
print("[✓] Using configuration from config.py")
|
| 34 |
+
except ImportError:
|
| 35 |
+
# Fallback to hardcoded settings
|
| 36 |
+
print("[⚠️] Config not found, using hardcoded settings")
|
| 37 |
+
# API tokens
|
| 38 |
+
APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB"
|
| 39 |
+
#APIFY_TOKEN_FB = APIFY_TOKEN
|
| 40 |
+
#APIFY_TOKEN_TIKTOK = APIFY_TOKEN
|
| 41 |
+
|
| 42 |
+
# Actor task IDs
|
| 43 |
+
#POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)
|
| 44 |
+
#COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)
|
| 45 |
+
TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)
|
| 46 |
+
TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Data source settings
|
| 51 |
+
USE_FACEBOOK = True
|
| 52 |
+
USE_TIKTOK = True
|
| 53 |
+
USE_SERPAPI = True
|
| 54 |
+
USE_SERPER = True
|
| 55 |
+
USE_DUCKDUCKGO = True
|
| 56 |
+
USE_LOWYAT = True
|
| 57 |
+
|
| 58 |
+
# Comment settings
|
| 59 |
+
USE_COMMENTS = True
|
| 60 |
+
|
| 61 |
+
# Result limits
|
| 62 |
+
FACEBOOK_MAX_RESULTS = 100
|
| 63 |
+
TIKTOK_MAX_RESULTS = 50
|
| 64 |
+
WEB_SEARCH_MAX_RESULTS = 20
|
| 65 |
+
LOWYAT_MAX_THREADS = 20
|
| 66 |
+
|
| 67 |
+
# Lowyat Forum settings
|
| 68 |
+
LOWYAT_SECTIONS = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 69 |
+
|
| 70 |
+
def run(keywords, output_path="output/claim_data.csv", fetch_comments=True, max_videos=30, max_comments=50, max_results=None):
|
| 71 |
+
"""Run data collection from multiple sources and combine results
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
keywords (list): List of keywords to search for
|
| 75 |
+
output_path (str): Path to save combined results
|
| 76 |
+
fetch_comments (bool): Whether to fetch comments for TikTok videos
|
| 77 |
+
max_videos (int): Maximum number of TikTok videos to fetch per keyword
|
| 78 |
+
max_comments (int): Maximum number of comments to fetch per TikTok video
|
| 79 |
+
max_results (int): Maximum results per source (overrides config settings)
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
pandas.DataFrame: Combined results from all sources
|
| 83 |
+
"""
|
| 84 |
+
all_records = []
|
| 85 |
+
|
| 86 |
+
# Use config settings if max_results not specified
|
| 87 |
+
fb_max = max_results or FACEBOOK_MAX_RESULTS
|
| 88 |
+
tiktok_max = max_results or TIKTOK_MAX_RESULTS
|
| 89 |
+
web_max = max_results or WEB_SEARCH_MAX_RESULTS
|
| 90 |
+
|
| 91 |
+
# Create output directory if it doesn't exist
|
| 92 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 93 |
+
# os.makedirs(output_path, exist_ok=True)
|
| 94 |
+
|
| 95 |
+
# Create a summary of data sources
|
| 96 |
+
sources_enabled = []
|
| 97 |
+
if USE_FACEBOOK: sources_enabled.append("Facebook")
|
| 98 |
+
if USE_TIKTOK: sources_enabled.append("TikTok")
|
| 99 |
+
if USE_SERPAPI: sources_enabled.append("SerpApi")
|
| 100 |
+
if USE_SERPER: sources_enabled.append("Serper.dev")
|
| 101 |
+
if USE_DUCKDUCKGO: sources_enabled.append("DuckDuckGo")
|
| 102 |
+
if USE_LOWYAT: sources_enabled.append("Lowyat Forum")
|
| 103 |
+
|
| 104 |
+
print(f"[📊] Data collection enabled for: {', '.join(sources_enabled)}")
|
| 105 |
+
print(f"[🔍] Original Keywords: {', '.join(keywords)}")
|
| 106 |
+
|
| 107 |
+
# Optimize keywords for different platforms
|
| 108 |
+
try:
|
| 109 |
+
from tiktok_keyword_formatter import optimize_keywords_for_platforms
|
| 110 |
+
optimized_keywords = optimize_keywords_for_platforms(keywords)
|
| 111 |
+
tiktok_keywords = optimized_keywords["tiktok"]
|
| 112 |
+
web_keywords = optimized_keywords["web_search"]
|
| 113 |
+
|
| 114 |
+
print(f"[🔍] TikTok Keywords: {', '.join(tiktok_keywords)}")
|
| 115 |
+
print(f"[🔍] Web Search Keywords: {', '.join(web_keywords)}")
|
| 116 |
+
except ImportError:
|
| 117 |
+
print("[⚠️] Keyword formatter not found. Using original keywords for all platforms.")
|
| 118 |
+
tiktok_keywords = keywords
|
| 119 |
+
web_keywords = keywords
|
| 120 |
+
|
| 121 |
+
# Facebook post search
|
| 122 |
+
if USE_FACEBOOK:
|
| 123 |
+
try:
|
| 124 |
+
boolean_query = build_boolean_search(keywords)
|
| 125 |
+
print(f"[📘] Facebook: {boolean_query}")
|
| 126 |
+
post_input = {"search": boolean_query, "resultsPerPage": min(fb_max, 100)}
|
| 127 |
+
|
| 128 |
+
post_dataset_id = run_actor_task(POST_TASK_ID_SEARCH, post_input, platform="facebook")
|
| 129 |
+
posts = download_dataset(post_dataset_id, platform="facebook")
|
| 130 |
+
print(f"[📘] Retrieved {len(posts)} Facebook posts")
|
| 131 |
+
|
| 132 |
+
fb_records = []
|
| 133 |
+
for post in posts:
|
| 134 |
+
# Check if this is Malaysian content
|
| 135 |
+
username = post.get("username", "")
|
| 136 |
+
text = post.get("text", "")
|
| 137 |
+
post_url = post.get("url")
|
| 138 |
+
|
| 139 |
+
if is_malaysian_content(username, text):
|
| 140 |
+
# Add the post itself
|
| 141 |
+
post_record = {
|
| 142 |
+
"platform": "facebook",
|
| 143 |
+
"date": post.get("createdAt"),
|
| 144 |
+
"username": username,
|
| 145 |
+
"post_text": text,
|
| 146 |
+
"post_url": post_url,
|
| 147 |
+
"likes": post.get("likes", 0),
|
| 148 |
+
"shares": post.get("shares", 0),
|
| 149 |
+
"comments_count": post.get("commentsCount", 0),
|
| 150 |
+
"comment_text": "",
|
| 151 |
+
"combined_text": text
|
| 152 |
+
}
|
| 153 |
+
fb_records.append(post_record)
|
| 154 |
+
|
| 155 |
+
# If comments are enabled and the post has comments, scrape them
|
| 156 |
+
if USE_COMMENTS and post.get("commentsCount", 0) > 0 and post_url:
|
| 157 |
+
try:
|
| 158 |
+
print(f"[💬] Scraping comments for Facebook post: {post_url}")
|
| 159 |
+
comment_input = {"url": post_url, "maxComments": 50}
|
| 160 |
+
comment_dataset_id = run_actor_task(COMMENT_TASK_ID, comment_input, platform="facebook")
|
| 161 |
+
comments = download_dataset(comment_dataset_id, platform="facebook")
|
| 162 |
+
print(f"[💬] Retrieved {len(comments)} comments for post")
|
| 163 |
+
|
| 164 |
+
for comment in comments:
|
| 165 |
+
comment_text = comment.get("text", "")
|
| 166 |
+
comment_username = comment.get("name", "")
|
| 167 |
+
|
| 168 |
+
if is_malaysian_content(comment_username, comment_text):
|
| 169 |
+
comment_record = {
|
| 170 |
+
"platform": "facebook_comment",
|
| 171 |
+
"date": comment.get("date"),
|
| 172 |
+
"username": comment_username,
|
| 173 |
+
"post_text": "",
|
| 174 |
+
"post_url": post_url,
|
| 175 |
+
"likes": comment.get("likes", 0),
|
| 176 |
+
"shares": 0,
|
| 177 |
+
"comments_count": 0,
|
| 178 |
+
"comment_text": comment_text,
|
| 179 |
+
"combined_text": comment_text
|
| 180 |
+
}
|
| 181 |
+
fb_records.append(comment_record)
|
| 182 |
+
except Exception as e:
|
| 183 |
+
print(f"[❌] Error scraping comments for post {post_url}: {str(e)}")
|
| 184 |
+
print("[⚠️] Continuing with next post...")
|
| 185 |
+
|
| 186 |
+
print(f"[📊] Added {len(fb_records)} Facebook records after filtering")
|
| 187 |
+
all_records.extend(fb_records)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
print(f"[❌] Error during Facebook scraping: {str(e)}")
|
| 190 |
+
print("[⚠️] Continuing with other data sources...")
|
| 191 |
+
|
| 192 |
+
# TikTok scraping
|
| 193 |
+
if USE_TIKTOK:
|
| 194 |
+
try:
|
| 195 |
+
print(f"[📽️] TikTok: Searching for {', '.join(tiktok_keywords)}")
|
| 196 |
+
tiktok_records = []
|
| 197 |
+
|
| 198 |
+
# Use only the top 3 most relevant keywords as requested
|
| 199 |
+
top_keywords = tiktok_keywords[:min(3, len(tiktok_keywords))]
|
| 200 |
+
print(f"[📽️] Using top {len(top_keywords)} TikTok keywords: {', '.join(top_keywords)}")
|
| 201 |
+
|
| 202 |
+
# Set video limits as requested by user
|
| 203 |
+
videos_per_keyword = max_videos # Use the parameter value
|
| 204 |
+
|
| 205 |
+
# No total video limit - collect exactly max_videos per keyword
|
| 206 |
+
total_videos_collected = 0
|
| 207 |
+
max_total_videos = max_videos * len(top_keywords) # Allow max_videos per keyword
|
| 208 |
+
|
| 209 |
+
# for keyword in top_keywords:
|
| 210 |
+
try:
|
| 211 |
+
# Print detailed debugging information
|
| 212 |
+
print(f"[📽️] DEBUG: TikTok API Token: {APIFY_TOKEN_TIKTOK[:5]}...{APIFY_TOKEN_TIKTOK[-5:]}")
|
| 213 |
+
print(f"[📽️] DEBUG: TikTok Video Task ID: {TIKTOK_VIDEO_TASK_ID}")
|
| 214 |
+
print(f"[📽️] DEBUG: TikTok Comment Task ID: {TIKTOK_COMMENT_TASK_ID}")
|
| 215 |
+
|
| 216 |
+
keyword = ', '.join(tiktok_keywords)
|
| 217 |
+
|
| 218 |
+
# Limit videos per keyword to save costs
|
| 219 |
+
tiktok_input = { "searchQueries": [keyword], "maxVideos": videos_per_keyword}
|
| 220 |
+
# tiktok_input ={"searchQueries": keyword}
|
| 221 |
+
print(f"[📽️] Requesting {videos_per_keyword} TikTok videos for: {keyword}")
|
| 222 |
+
print(f"[📽️] DEBUG: Full input payload: {tiktok_input}")
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
try:
|
| 226 |
+
tiktok_dataset_id = run_actor_task(TIKTOK_VIDEO_TASK_ID, tiktok_input, platform="tiktok")
|
| 227 |
+
print(f"[📽️] DEBUG: Successfully got dataset ID: {tiktok_dataset_id}")
|
| 228 |
+
videos = download_dataset(tiktok_dataset_id, platform="tiktok")
|
| 229 |
+
print(f"[📽️] Retrieved {len(videos)} TikTok videos for: {keyword}")
|
| 230 |
+
except Exception as e:
|
| 231 |
+
print(f"[❌] DETAILED ERROR in TikTok video extraction: {str(e)}")
|
| 232 |
+
print(f"[❌] Error type: {type(e).__name__}")
|
| 233 |
+
import traceback
|
| 234 |
+
print(f"[❌] Traceback: {traceback.format_exc()}")
|
| 235 |
+
videos = []
|
| 236 |
+
|
| 237 |
+
for video in videos:
|
| 238 |
+
# Check if we've reached the maximum total videos limit
|
| 239 |
+
if total_videos_collected >= max_total_videos:
|
| 240 |
+
print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping collection.")
|
| 241 |
+
break
|
| 242 |
+
|
| 243 |
+
username = video.get("authorMeta", {}).get("userName", "") or video.get("authorMeta", {}).get("name", "")
|
| 244 |
+
caption = video.get("text", "")
|
| 245 |
+
|
| 246 |
+
if is_malaysian_content(username, caption):
|
| 247 |
+
# Increment the total videos counter
|
| 248 |
+
total_videos_collected += 1
|
| 249 |
+
video_url = video.get("webVideoUrl") or video.get("videoUrl")
|
| 250 |
+
clean_url = video_url.split("?")[0] if video_url and "/video/" in video_url else None
|
| 251 |
+
|
| 252 |
+
video_record = {
|
| 253 |
+
"platform": "tiktok",
|
| 254 |
+
"date": video.get("createTimeISO") or video.get("createTime"),
|
| 255 |
+
"username": username,
|
| 256 |
+
"post_text": caption,
|
| 257 |
+
"post_url": clean_url,
|
| 258 |
+
"likes": video.get("diggCount", 0),
|
| 259 |
+
"shares": video.get("shareCount", 0),
|
| 260 |
+
"comments_count": video.get("commentCount", 0),
|
| 261 |
+
"comment_text": "",
|
| 262 |
+
"combined_text": caption
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
tiktok_records.append(video_record)
|
| 266 |
+
|
| 267 |
+
# If comments are enabled and the video has comments, scrape them
|
| 268 |
+
# Get comments per video as requested by the user
|
| 269 |
+
min_comments_threshold = 5 # Lower threshold to ensure we get comments
|
| 270 |
+
max_comments_to_scrape = max_comments # Use the parameter value
|
| 271 |
+
max_videos_with_comments = 10 # Allow more videos with comments
|
| 272 |
+
|
| 273 |
+
# Track how many videos we've scraped comments for
|
| 274 |
+
if not hasattr(run, 'videos_with_comments_count'):
|
| 275 |
+
run.videos_with_comments_count = 0
|
| 276 |
+
|
| 277 |
+
if (fetch_comments and
|
| 278 |
+
run.videos_with_comments_count < max_videos_with_comments and
|
| 279 |
+
video.get("commentCount", 0) >= min_comments_threshold and
|
| 280 |
+
clean_url and
|
| 281 |
+
video.get("diggCount", 0) > 10): # Very low threshold to ensure we get comments for most videos
|
| 282 |
+
try:
|
| 283 |
+
print(f"[💬] Scraping comments for popular TikTok video ({run.videos_with_comments_count+1}/{max_videos_with_comments}): {clean_url}")
|
| 284 |
+
comment_input = {"postURLs": [clean_url], "commentsPerPost": max_comments_to_scrape}
|
| 285 |
+
print(f"[💬] DEBUG: Comment input payload: {comment_input}")
|
| 286 |
+
|
| 287 |
+
try:
|
| 288 |
+
comment_dataset_id = run_actor_task(TIKTOK_COMMENT_TASK_ID, comment_input, platform="tiktok")
|
| 289 |
+
print(f"[💬] DEBUG: Successfully got comment dataset ID: {comment_dataset_id}")
|
| 290 |
+
comments = download_dataset(comment_dataset_id, platform="tiktok")
|
| 291 |
+
run.videos_with_comments_count += 1
|
| 292 |
+
print(f"[💬] Retrieved {len(comments)} comments for video")
|
| 293 |
+
except Exception as e:
|
| 294 |
+
print(f"[❌] DETAILED ERROR in TikTok comment extraction: {str(e)}")
|
| 295 |
+
print(f"[❌] Error type: {type(e).__name__}")
|
| 296 |
+
import traceback
|
| 297 |
+
print(f"[❌] Traceback: {traceback.format_exc()}")
|
| 298 |
+
comments = []
|
| 299 |
+
|
| 300 |
+
for comment in comments:
|
| 301 |
+
comment_text = comment.get("text", "")
|
| 302 |
+
comment_username = comment.get("author", {}).get("uniqueId", "") or comment.get("author", {}).get("nickname", "")
|
| 303 |
+
|
| 304 |
+
if is_malaysian_content(comment_username, comment_text):
|
| 305 |
+
comment_record = {
|
| 306 |
+
"platform": "tiktok_comment",
|
| 307 |
+
"date": comment.get("createTime"),
|
| 308 |
+
"username": comment_username,
|
| 309 |
+
"post_text": "",
|
| 310 |
+
"post_url": clean_url,
|
| 311 |
+
"likes": comment.get("diggCount", 0),
|
| 312 |
+
"shares": 0,
|
| 313 |
+
"comments_count": 0,
|
| 314 |
+
"comment_text": comment_text,
|
| 315 |
+
"combined_text": comment_text
|
| 316 |
+
}
|
| 317 |
+
tiktok_records.append(comment_record)
|
| 318 |
+
except Exception as e:
|
| 319 |
+
print(f"[❌] Error scraping comments for video {clean_url}: {str(e)}")
|
| 320 |
+
print("[⚠️] Continuing with next video...")
|
| 321 |
+
# Check if we've reached the maximum total videos limit after processing this keyword
|
| 322 |
+
if total_videos_collected >= max_total_videos:
|
| 323 |
+
print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping keyword search.")
|
| 324 |
+
break
|
| 325 |
+
except Exception as e:
|
| 326 |
+
print(f"[❌] Error processing TikTok keyword '{keyword}': {str(e)}")
|
| 327 |
+
print("[⚠️] Continuing with next keyword...")
|
| 328 |
+
|
| 329 |
+
print(f"[📊] Added {len(tiktok_records)} TikTok records after filtering")
|
| 330 |
+
all_records.extend(tiktok_records)
|
| 331 |
+
except Exception as e:
|
| 332 |
+
print(f"[❌] Error during TikTok scraping: {str(e)}")
|
| 333 |
+
print("[⚠️] Continuing with other data sources...")
|
| 334 |
+
|
| 335 |
+
# Web search (SerpApi, Serper.dev, DuckDuckGo)
|
| 336 |
+
if USE_SERPAPI or USE_SERPER or USE_DUCKDUCKGO:
|
| 337 |
+
try:
|
| 338 |
+
print(f"[🌐] Web Search: Searching for {', '.join(web_keywords)}")
|
| 339 |
+
web_search_output = f"output/{os.path.basename(output_path).split('.')[0]}_web.csv"
|
| 340 |
+
|
| 341 |
+
# Try to import the run_web_search function
|
| 342 |
+
try:
|
| 343 |
+
from run_web_search import run_web_search
|
| 344 |
+
|
| 345 |
+
# Get the full claim from the environment if available
|
| 346 |
+
full_claim = os.environ.get("FULL_CLAIM", None)
|
| 347 |
+
if full_claim:
|
| 348 |
+
print(f"[🔍] Using full claim for web search: {full_claim}")
|
| 349 |
+
|
| 350 |
+
# Pass configuration settings to run_web_search
|
| 351 |
+
web_results_count = run_web_search(
|
| 352 |
+
web_keywords,
|
| 353 |
+
web_search_output,
|
| 354 |
+
num_results=web_max,
|
| 355 |
+
use_serpapi=USE_SERPAPI,
|
| 356 |
+
use_serper=USE_SERPER,
|
| 357 |
+
use_duckduckgo=USE_DUCKDUCKGO,
|
| 358 |
+
full_claim=full_claim
|
| 359 |
+
)
|
| 360 |
+
print(f"[🌐] Retrieved {web_results_count} web search results")
|
| 361 |
+
|
| 362 |
+
# If web search was successful, read the results and add to all_records
|
| 363 |
+
if web_results_count > 0:
|
| 364 |
+
try:
|
| 365 |
+
web_df = pd.read_csv(web_search_output)
|
| 366 |
+
web_records = web_df.to_dict('records')
|
| 367 |
+
all_records.extend(web_records)
|
| 368 |
+
print(f"[📊] Added {len(web_records)} web search records")
|
| 369 |
+
except Exception as e:
|
| 370 |
+
print(f"[❌] Error reading web search results: {str(e)}")
|
| 371 |
+
except ImportError:
|
| 372 |
+
print("[⚠️] Web search module not found. Skipping web search.")
|
| 373 |
+
except Exception as e:
|
| 374 |
+
print(f"[❌] Error during web search: {str(e)}")
|
| 375 |
+
|
| 376 |
+
# Lowyat Forum data collection
|
| 377 |
+
if USE_LOWYAT:
|
| 378 |
+
try:
|
| 379 |
+
print(f"[📚] Collecting data from Lowyat Forum...")
|
| 380 |
+
|
| 381 |
+
# Import the Lowyat Forum crawler
|
| 382 |
+
try:
|
| 383 |
+
from lowyat_crawler import run_lowyat_crawler
|
| 384 |
+
|
| 385 |
+
# Use the same keywords for Lowyat Forum
|
| 386 |
+
lowyat_keywords = keywords
|
| 387 |
+
|
| 388 |
+
# Check for environment variable override for sections
|
| 389 |
+
sections_to_use = LOWYAT_SECTIONS
|
| 390 |
+
if os.environ.get("LOWYAT_SECTIONS"):
|
| 391 |
+
sections_to_use = os.environ.get("LOWYAT_SECTIONS").split(",")
|
| 392 |
+
print(f"[📚] Using Lowyat Forum sections from environment: {', '.join(sections_to_use)}")
|
| 393 |
+
|
| 394 |
+
# Get the full claim from the environment if available
|
| 395 |
+
full_claim = os.environ.get("FULL_CLAIM", None)
|
| 396 |
+
if full_claim:
|
| 397 |
+
print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
|
| 398 |
+
|
| 399 |
+
# Get Lowyat Forum data
|
| 400 |
+
lowyat_output_path = output_path.replace(".csv", "_lowyat.csv")
|
| 401 |
+
try:
|
| 402 |
+
lowyat_df = run_lowyat_crawler(
|
| 403 |
+
lowyat_keywords,
|
| 404 |
+
sections=sections_to_use,
|
| 405 |
+
max_threads=LOWYAT_MAX_THREADS,
|
| 406 |
+
output_path=lowyat_output_path,
|
| 407 |
+
full_claim=full_claim
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# Convert DataFrame to records and add to all_records
|
| 411 |
+
if not lowyat_df.empty:
|
| 412 |
+
lowyat_records = lowyat_df.to_dict('records')
|
| 413 |
+
all_records.extend(lowyat_records)
|
| 414 |
+
print(f"[📚] Added {len(lowyat_records)} Lowyat Forum records")
|
| 415 |
+
else:
|
| 416 |
+
print(f"[⚠️] No Lowyat Forum data found for keywords: {', '.join(lowyat_keywords)}")
|
| 417 |
+
|
| 418 |
+
# Generate sample data for testing if needed
|
| 419 |
+
if os.environ.get("GENERATE_SAMPLE_LOWYAT_DATA", "false").lower() == "true":
|
| 420 |
+
print("[📚] Generating sample Lowyat Forum data for testing...")
|
| 421 |
+
|
| 422 |
+
# Create a sample dataframe with the claim
|
| 423 |
+
from datetime import datetime
|
| 424 |
+
current_date = datetime.now().strftime('%Y-%m-%d')
|
| 425 |
+
|
| 426 |
+
# Get the claim text or keywords
|
| 427 |
+
claim_text = full_claim if full_claim else ', '.join(lowyat_keywords)
|
| 428 |
+
|
| 429 |
+
# Create relevant sample data based on claim content
|
| 430 |
+
sample_data = []
|
| 431 |
+
|
| 432 |
+
# Check for different types of claims and create relevant sample data
|
| 433 |
+
if any(term in claim_text.lower() for term in ['hon', 'tenonet', 'kenderaan', 'kereta']):
|
| 434 |
+
# Horn/vehicle related claim
|
| 435 |
+
sample_data.append({
|
| 436 |
+
'platform': 'LowyatForum',
|
| 437 |
+
'date': current_date,
|
| 438 |
+
'username': 'CarEnthusiast',
|
| 439 |
+
'post_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini.",
|
| 440 |
+
'post_url': 'https://forum.lowyat.net/topic/hon-tenonet',
|
| 441 |
+
'likes': 15,
|
| 442 |
+
'shares': 3,
|
| 443 |
+
'comments_count': 8,
|
| 444 |
+
'comment_text': '',
|
| 445 |
+
'combined_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini."
|
| 446 |
+
})
|
| 447 |
+
|
| 448 |
+
sample_data.append({
|
| 449 |
+
'platform': 'LowyatForum_Comment',
|
| 450 |
+
'date': current_date,
|
| 451 |
+
'username': 'LegalExpert',
|
| 452 |
+
'post_text': '',
|
| 453 |
+
'post_url': 'https://forum.lowyat.net/topic/hon-tenonet#comment1',
|
| 454 |
+
'likes': 7,
|
| 455 |
+
'shares': 0,
|
| 456 |
+
'comments_count': 0,
|
| 457 |
+
'comment_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000.",
|
| 458 |
+
'combined_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000."
|
| 459 |
+
})
|
| 460 |
+
|
| 461 |
+
elif any(term in claim_text.lower() for term in ['kelantan', 'rogol', 'sumbang mahram', 'jenayah']):
|
| 462 |
+
# Crime in Kelantan related claim
|
| 463 |
+
sample_data.append({
|
| 464 |
+
'platform': 'LowyatForum',
|
| 465 |
+
'date': current_date,
|
| 466 |
+
'username': 'SocialObserver',
|
| 467 |
+
'post_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini.",
|
| 468 |
+
'post_url': 'https://forum.lowyat.net/topic/crime-statistics',
|
| 469 |
+
'likes': 12,
|
| 470 |
+
'shares': 5,
|
| 471 |
+
'comments_count': 7,
|
| 472 |
+
'comment_text': '',
|
| 473 |
+
'combined_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini."
|
| 474 |
+
})
|
| 475 |
+
|
| 476 |
+
sample_data.append({
|
| 477 |
+
'platform': 'LowyatForum_Comment',
|
| 478 |
+
'date': current_date,
|
| 479 |
+
'username': 'CommunityLeader',
|
| 480 |
+
'post_text': '',
|
| 481 |
+
'post_url': 'https://forum.lowyat.net/topic/crime-statistics#comment1',
|
| 482 |
+
'likes': 8,
|
| 483 |
+
'shares': 0,
|
| 484 |
+
'comments_count': 0,
|
| 485 |
+
'comment_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah.",
|
| 486 |
+
'combined_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah."
|
| 487 |
+
})
|
| 488 |
+
|
| 489 |
+
elif any(term in claim_text.lower() for term in ['kelongsong', 'peluru', 'senjata', 'tan']):
|
| 490 |
+
# Ammunition related claim
|
| 491 |
+
sample_data.append({
|
| 492 |
+
'platform': 'LowyatForum',
|
| 493 |
+
'date': current_date,
|
| 494 |
+
'username': 'SecurityAnalyst',
|
| 495 |
+
'post_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?",
|
| 496 |
+
'post_url': 'https://forum.lowyat.net/topic/security-threat',
|
| 497 |
+
'likes': 25,
|
| 498 |
+
'shares': 10,
|
| 499 |
+
'comments_count': 15,
|
| 500 |
+
'comment_text': '',
|
| 501 |
+
'combined_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?"
|
| 502 |
+
})
|
| 503 |
+
|
| 504 |
+
sample_data.append({
|
| 505 |
+
'platform': 'LowyatForum_Comment',
|
| 506 |
+
'date': current_date,
|
| 507 |
+
'username': 'DefenseExpert',
|
| 508 |
+
'post_text': '',
|
| 509 |
+
'post_url': 'https://forum.lowyat.net/topic/security-threat#comment1',
|
| 510 |
+
'likes': 18,
|
| 511 |
+
'shares': 0,
|
| 512 |
+
'comments_count': 0,
|
| 513 |
+
'comment_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah.",
|
| 514 |
+
'combined_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah."
|
| 515 |
+
})
|
| 516 |
+
|
| 517 |
+
elif any(term in claim_text.lower() for term in ['minyak sawit', 'cukai', 'ekonomi']):
|
| 518 |
+
# Palm oil tax related claim
|
| 519 |
+
sample_data.append({
|
| 520 |
+
'platform': 'LowyatForum',
|
| 521 |
+
'date': current_date,
|
| 522 |
+
'username': 'EconomyWatcher',
|
| 523 |
+
'post_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara.",
|
| 524 |
+
'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax',
|
| 525 |
+
'likes': 20,
|
| 526 |
+
'shares': 8,
|
| 527 |
+
'comments_count': 12,
|
| 528 |
+
'comment_text': '',
|
| 529 |
+
'combined_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara."
|
| 530 |
+
})
|
| 531 |
+
|
| 532 |
+
sample_data.append({
|
| 533 |
+
'platform': 'LowyatForum_Comment',
|
| 534 |
+
'date': current_date,
|
| 535 |
+
'username': 'IndustryInsider',
|
| 536 |
+
'post_text': '',
|
| 537 |
+
'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax#comment1',
|
| 538 |
+
'likes': 15,
|
| 539 |
+
'shares': 0,
|
| 540 |
+
'comments_count': 0,
|
| 541 |
+
'comment_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak.",
|
| 542 |
+
'combined_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak."
|
| 543 |
+
})
|
| 544 |
+
|
| 545 |
+
else:
|
| 546 |
+
# Default generic sample data if no specific claim type is detected
|
| 547 |
+
sample_data.append({
|
| 548 |
+
'platform': 'LowyatForum',
|
| 549 |
+
'date': current_date,
|
| 550 |
+
'username': 'LowyatUser123',
|
| 551 |
+
'post_text': f"Discussing: {claim_text}",
|
| 552 |
+
'post_url': 'https://forum.lowyat.net/topic/sample',
|
| 553 |
+
'likes': 5,
|
| 554 |
+
'shares': 0,
|
| 555 |
+
'comments_count': 2,
|
| 556 |
+
'comment_text': '',
|
| 557 |
+
'combined_text': f"Discussing: {claim_text}"
|
| 558 |
+
})
|
| 559 |
+
|
| 560 |
+
sample_data.append({
|
| 561 |
+
'platform': 'LowyatForum_Comment',
|
| 562 |
+
'date': current_date,
|
| 563 |
+
'username': 'LowyatCommenter',
|
| 564 |
+
'post_text': '',
|
| 565 |
+
'post_url': 'https://forum.lowyat.net/topic/sample#comment1',
|
| 566 |
+
'likes': 2,
|
| 567 |
+
'shares': 0,
|
| 568 |
+
'comments_count': 0,
|
| 569 |
+
'comment_text': f"Commenting on: {claim_text}",
|
| 570 |
+
'combined_text': f"Commenting on: {claim_text}"
|
| 571 |
+
})
|
| 572 |
+
|
| 573 |
+
# If no sample data was created (unlikely), create a default one
|
| 574 |
+
if not sample_data:
|
| 575 |
+
sample_data.append({
|
| 576 |
+
'platform': 'LowyatForum',
|
| 577 |
+
'date': current_date,
|
| 578 |
+
'username': 'LowyatUser123',
|
| 579 |
+
'post_text': f"Discussing: {claim_text}",
|
| 580 |
+
'post_url': 'https://forum.lowyat.net/topic/sample',
|
| 581 |
+
'likes': 5,
|
| 582 |
+
'shares': 0,
|
| 583 |
+
'comments_count': 2,
|
| 584 |
+
'comment_text': '',
|
| 585 |
+
'combined_text': f"Discussing: {claim_text}"
|
| 586 |
+
})
|
| 587 |
+
|
| 588 |
+
sample_df = pd.DataFrame(sample_data)
|
| 589 |
+
if lowyat_output_path:
|
| 590 |
+
sample_df.to_csv(lowyat_output_path, index=False)
|
| 591 |
+
|
| 592 |
+
all_records.extend(sample_data)
|
| 593 |
+
print(f"[📚] Added {len(sample_data)} sample Lowyat Forum records")
|
| 594 |
+
except Exception as e:
|
| 595 |
+
print(f"[⚠️] Error during Lowyat Forum crawling: {str(e)}")
|
| 596 |
+
print("[⚠️] Continuing without Lowyat Forum data...")
|
| 597 |
+
|
| 598 |
+
except ImportError:
|
| 599 |
+
print("[❌] Lowyat Forum crawler module not found. Skipping Lowyat Forum data collection.")
|
| 600 |
+
|
| 601 |
+
except Exception as e:
|
| 602 |
+
print(f"[❌] Error during Lowyat Forum data collection: {str(e)}")
|
| 603 |
+
print("[⚠️] Continuing with other data sources...")
|
| 604 |
+
|
| 605 |
+
# Save all records to CSV
|
| 606 |
+
if all_records:
|
| 607 |
+
df = pd.DataFrame(all_records)
|
| 608 |
+
df.to_csv(output_path, index=False)
|
| 609 |
+
print(f"[💾] Saved {len(df)} records to {output_path}")
|
| 610 |
+
|
| 611 |
+
# Print summary of data sources
|
| 612 |
+
source_counts = df['platform'].value_counts().to_dict()
|
| 613 |
+
print("\n[📊] Data collection summary:")
|
| 614 |
+
for source, count in source_counts.items():
|
| 615 |
+
# Use shorter display names for Lowyat Forum sources
|
| 616 |
+
display_source = source
|
| 617 |
+
if source == "LowyatForum":
|
| 618 |
+
display_source = "LF"
|
| 619 |
+
elif source == "LowyatForum_Comment":
|
| 620 |
+
display_source = "LF_Comment"
|
| 621 |
+
print(f" - {display_source}: {count} records")
|
| 622 |
+
|
| 623 |
+
return df
|
| 624 |
+
else:
|
| 625 |
+
# Create empty DataFrame and save to CSV
|
| 626 |
+
empty_df = pd.DataFrame(columns=["platform", "date", "username", "post_text", "post_url", "likes", "shares", "comments_count", "comment_text", "combined_text"])
|
| 627 |
+
empty_df.to_csv(output_path, index=False)
|
| 628 |
+
print(f"[⚠️] No records found. Saved empty DataFrame to {output_path}")
|
| 629 |
+
return empty_df
|
| 630 |
+
|
| 631 |
+
def run_actor_task(task_id, input_payload, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
    # Generate a cache key based on task_id and input_payload
    cache_key = f"{task_id}_{json.dumps(input_payload, sort_keys=True)}"
    cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
    cache_file = os.path.join(CACHE_DIR, f"{cache_hash}.json")

    # Check if we have a valid cached result
    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cache_data = json.load(f)

            # Check if cache is still valid
            cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
            cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)

            if datetime.now() < cache_expiry:
                print(f"[💾] Using cached result for task {task_id} (expires {cache_expiry.isoformat()})")
                return cache_data.get('dataset_id')
            else:
                print(f"[⏰] Cache expired for task {task_id}, fetching fresh data")
        except Exception as e:
            print(f"[⚠️] Error reading cache: {str(e)}")

    token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    url = f"https://api.apify.com/v2/actor-tasks/{task_id}/runs"

    # Try multiple times in case of network issues
    for attempt in range(max_retries):
        try:
            print(f"[🔄] Attempt {attempt+1}/{max_retries} to run task {task_id}...")
            print(input_payload)
            # response = requests.post(url, json={"input": input_payload}, headers=headers, timeout=timeout)
            response = requests.post(url, json=input_payload, headers=headers, timeout=timeout)

            if response.status_code != 201:
                print(f"[❌] Failed to run task: {response.text}")
                if attempt < max_retries - 1:
                    print("[⏳] Retrying...")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    continue
                raise Exception(f"Task run failed after {max_retries} attempts.")

            run_id = response.json()["data"]["id"]
            print(f"[🟢] Task {task_id} started: {run_id}")
            status_url = f"https://api.apify.com/v2/actor-runs/{run_id}"
            break  # Success, exit the retry loop
        except requests.exceptions.Timeout:
            print(f"[❌] Request timed out after {timeout} seconds")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Task run timed out after {max_retries} attempts.")
        except requests.exceptions.ConnectionError:
            print(f"[❌] Connection error")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Connection error after {max_retries} attempts.")
        except Exception as e:
            print(f"[❌] Unexpected error: {str(e)}")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")

    while True:
        status_data = requests.get(status_url, headers=headers).json()
        if status_data["data"]["status"] in ["SUCCEEDED", "FAILED"]:
            break
        print("[⏳] Waiting for task run to complete...")
        time.sleep(5)

    if status_data["data"]["status"] == "SUCCEEDED":
        dataset_id = status_data["data"]["defaultDatasetId"]

        # Save result to cache
        if use_cache:
            try:
                cache_data = {
                    "dataset_id": dataset_id,
                    "timestamp": datetime.now().isoformat(),
                    "task_id": task_id,
                    "platform": platform
                }

                with open(cache_file, 'w') as f:
                    json.dump(cache_data, f)

                print(f"[💾] Saved result to cache: {cache_file}")
            except Exception as e:
                print(f"[⚠️] Error saving to cache: {str(e)}")

        return dataset_id
    else:
        raise Exception("Task run failed.")

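For reference, the cache key above depends only on the task ID and the JSON-serialised payload (keys sorted), so identical requests always resolve to the same file under CACHE_DIR. A minimal sketch of that mapping, using an illustrative payload (the real field names depend on the Apify actor):

import hashlib, json, os

CACHE_DIR = "cache"  # stand-in for the module-level cache directory
task_id = "rfk0BzRAjuLPbccaZ"
payload = {"searchQueries": ["minyak sawit"], "resultsPerPage": 10}  # illustrative fields

cache_key = f"{task_id}_{json.dumps(payload, sort_keys=True)}"
cache_file = os.path.join(CACHE_DIR, hashlib.md5(cache_key.encode()).hexdigest() + ".json")
print(cache_file)  # same payload in any key order -> same cache file
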
def is_malaysian_content(username, text):
    # Check if content is relevant to the claim
    user_lower = (username or "").lower()
    text_lower = (text or "").lower()

    # Get the full claim from environment if available
    full_claim = os.environ.get("FULL_CLAIM", "")
    claim_lower = full_claim.lower()

    # Check if this is about sexual crimes in Kelantan
    kelantan_sexual_crime = "kelantan" in claim_lower and ("rogol" in claim_lower or "sumbang mahram" in claim_lower)

    if kelantan_sexual_crime:
        # For the specific claim about sexual crimes in Kelantan, use very targeted filtering
        kelantan_keywords = ["kelantan", "kelantanese"]
        crime_keywords = ["rogol", "sumbang mahram", "jenayah seksual", "kes", "polis", "pdrm"]

        # Must have at least one Kelantan reference AND one crime reference to be relevant
        has_kelantan_ref = any(k in text_lower for k in kelantan_keywords)
        has_crime_ref = any(k in text_lower for k in crime_keywords)

        if has_kelantan_ref and has_crime_ref:
            return True

        # Check if username is from a relevant authority
        authority_users = ["polis", "pdrm", "kelantan", "bukit aman", "bernama", "berita"]
        if any(k in user_lower for k in authority_users):
            return True

        # More restrictive for this specific claim - return False if not matching criteria
        return False
    else:
        # General Malaysian content detection for other claims
        # Keywords for crime-related content
        crime_keywords = [
            "polis", "kelantan", "jenayah", "rogol", "sumbang mahram", "inses",
            "kes", "statistik", "bimbang", "pdrm", "malaysia", "undang-undang",
            "mahkamah", "hukuman", "tangkap", "siasat", "lapor", "mangsa", "suspek",
            "tertuduh", "penderaan", "seksual", "cabul", "gangguan"
        ]

        # Check if any crime keywords are in the text
        if any(k in text_lower for k in crime_keywords):
            return True

        # Check if username looks Malaysian
        malaysian_user_indicators = [
            "my", "ms", "malaysia", "officialmy", "rakyat", "malay",
            "dr", "dato", "yb", "ustaz", "cikgu", "polis", "kelantan"
        ]

        if any(k in user_lower for k in malaysian_user_indicators):
            return True

        # Default to True for now to maximize data collection, but with better filtering
        return True

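A quick illustration of how this filter behaves; the records below are made-up examples, and FULL_CLAIM is only honoured if the caller exported it beforehand:

os.environ["FULL_CLAIM"] = "Kes rogol dan sumbang mahram di Kelantan paling tinggi"

samples = [
    ("PDRM Kelantan", "Statistik kes rogol di Kelantan menurun tahun ini"),
    ("random_user", "Best place for nasi lemak in KL?"),
]
for username, text in samples:
    print(username, is_malaysian_content(username, text))
# With a Kelantan sexual-crime claim active, only the first record (Kelantan + crime reference) passes.
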
def download_dataset(dataset_id, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
    # Check if we have a cached dataset
    cache_file = os.path.join(CACHE_DIR, f"dataset_{dataset_id}.json")

    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cache_data = json.load(f)

            # Check if cache is still valid
            cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
            cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)

            if datetime.now() < cache_expiry:
                print(f"[💾] Using cached dataset {dataset_id} (expires {cache_expiry.isoformat()})")
                return cache_data.get('data', [])
            else:
                print(f"[⏰] Cache expired for dataset {dataset_id}, fetching fresh data")
        except Exception as e:
            print(f"[⚠️] Error reading dataset cache: {str(e)}")

    token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
    headers = {
        "Authorization": f"Bearer {token}"
    }
    dataset_url = f"https://api.apify.com/v2/datasets/{dataset_id}/items?clean=true&format=json"

    # Try multiple times in case of network issues
    for attempt in range(max_retries):
        try:
            print(f"[🔄] Attempt {attempt+1}/{max_retries} to download dataset {dataset_id}...")
            response = requests.get(dataset_url, headers=headers, timeout=timeout)

            if response.status_code != 200:
                print(f"[❌] Failed to download dataset: {response.text}")
                if attempt < max_retries - 1:
                    print("[⏳] Retrying...")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    continue
                raise Exception(f"Dataset download failed after {max_retries} attempts.")

            data = response.json()
            print(f"[✓] Downloaded {len(data)} items from dataset {dataset_id}")

            # Save dataset to cache
            if use_cache:
                try:
                    cache_data = {
                        "data": data,
                        "timestamp": datetime.now().isoformat(),
                        "dataset_id": dataset_id,
                        "platform": platform
                    }

                    with open(cache_file, 'w') as f:
                        json.dump(cache_data, f)

                    print(f"[💾] Saved dataset to cache: {cache_file}")
                except Exception as e:
                    print(f"[⚠️] Error saving dataset to cache: {str(e)}")

            return data
        except requests.exceptions.Timeout:
            print(f"[❌] Request timed out after {timeout} seconds")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Dataset download timed out after {max_retries} attempts.")
        except requests.exceptions.ConnectionError:
            print(f"[❌] Connection error")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Connection error after {max_retries} attempts.")
        except Exception as e:
            print(f"[❌] Unexpected error: {str(e)}")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")

    # If we get here, all retries failed
    return []

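Taken together, run_actor_task and download_dataset form the fetch pipeline used by the rest of this module. A minimal sketch of a TikTok search run follows; the import paths are assumed from this repo's layout and the payload field name is illustrative, since the exact input schema is actor-specific:

from ai_api.library.apify_scraper import build_boolean_search, run_actor_task, download_dataset
from ai_api.library.config import TIKTOK_VIDEO_TASK_ID

search_query = build_boolean_search(["cukai", "minyak sawit"])
payload = {"searchQueries": [search_query]}  # illustrative field name; check the actor's input schema

dataset_id = run_actor_task(TIKTOK_VIDEO_TASK_ID, payload, platform="tiktok")
items = download_dataset(dataset_id, platform="tiktok")
print(f"Fetched {len(items)} items from Apify dataset {dataset_id}")
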
def build_boolean_search(keywords):
    """Build an optimized search query for social media platforms"""
    search_terms = []

    for kw in keywords:
        # If keyword contains spaces (multi-word phrase), wrap in quotes
        if " " in kw:
            search_terms.append(f'"{kw}"')
        else:
            # For single words, don't use quotes to get broader results
            search_terms.append(kw)

    return " OR ".join(search_terms)

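For example, multi-word phrases are quoted while single terms stay bare:

print(build_boolean_search(["minyak sawit", "cukai", "harga barang"]))
# "minyak sawit" OR cukai OR "harga barang"
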
ai_api/library/config.py
ADDED
@@ -0,0 +1,131 @@
"""
config.py
Central configuration for the claim analysis system
"""

import os

# Base directories
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")

# Create directories if they don't exist
for directory in [DATA_DIR, OUTPUT_DIR, REPORTS_DIR]:
    os.makedirs(directory, exist_ok=True)

# API Keys
GOOGLE_API_KEY = "AIzaSyAnXTkB_0HKXKul3eI-1A56ZQWyjTVj1cQ" # Google Custom Search API key
GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Add your search engine ID here (you'll need to create this)

# Serper.dev API Key (alternative search API)
SERPER_API_KEY = "e0af440fd71fb125dd38644fe378831c3ed741ca"

# SerpApi Google Search API Key
SERPAPI_API_KEY = "007928aeb7d86d4a85af12728e3534163961837027afb63ec7b89a4624a9f4ac"

# Data source settings
USE_FACEBOOK = False # Disable Facebook data collection
USE_TIKTOK = True # Enable TikTok data collection
USE_SERPAPI = True # Enable SerpApi web search
USE_SERPER = True # Enable Serper.dev web search
USE_DUCKDUCKGO = False # Disable DuckDuckGo web search
USE_LOWYAT = True # Enable Lowyat Forum data collection

# Number of results to collect from each source
FACEBOOK_MAX_RESULTS = 100
TIKTOK_MAX_RESULTS = 10 # Significantly reduced to save Apify costs
WEB_SEARCH_MAX_RESULTS = 20
LOWYAT_MAX_THREADS = 20 # Maximum number of Lowyat Forum threads to collect

# Lowyat Forum settings
LOWYAT_SECTIONS = [
    "Kopitiam", "SeriousKopitiam", "News", "Politics", "Malaysia", "Lowyat.NET",
    "Technology", "Computers", "Notebooks", "Smartphones", "Photography", "GamingPC", "GamingConsole",
    "Automotive", "Finance", "Property", "Travel", "Food", "Health", "Sports", "Entertainment",
    "SpecialInterestGarageSales", "JobsCorner", "DigitalMarketplace"
] # All available forum sections

# Social Media API tokens
APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB" # Main Apify API token
APIFY_TOKEN_FB = APIFY_TOKEN # For Facebook actors
APIFY_TOKEN_TIKTOK = APIFY_TOKEN # For TikTok actors

# Actor task IDs
# From danek/facebook-search-ppr
POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)

# From datavoyantlab/facebook-comments-scraper
COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)

# From clockworks/free-tiktok-scraper
TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)

# From clockworks/tiktok-comments-scraper
TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)

# Apify settings
USE_COMMENTS = True # Whether to collect comments in addition to posts/videos

# Sentiment model
SENTIMENT_MODEL = "rmtariq/ft-Malay-bert"

# Priority indexer settings
PRIORITY_WEIGHTS = {
    "fact_check_value": 1.5, # Higher weight for factual importance
    "cause_confusion": 1.2, # Medium-high weight for confusion potential
    "cause_chaos": 1.8, # High weight for potential harm
    "affects_government": 1.3, # Medium-high for government impact
    "economic_impact": 1.4, # Medium-high for economic impact
    "law_related": 1.5, # Higher weight for legal implications
    "public_interest": 1.2, # Medium weight for public interest
    "lives_in_danger": 2.0, # Highest weight for safety concerns
    "viral": 1.1, # Lower weight for virality alone
    "urgent": 1.3 # Medium-high for urgency
}

PRIORITY_THRESHOLDS = {
    "high_priority": 7.0,
    "medium_priority": 5.0,
    "low_priority": 3.0
}

# Classification settings
VERDICT_CATEGORIES = {
    "TIDAK_BENAR": {
        "name": "TIDAK BENAR",
        "description": "Dakwaan ini tidak benar berdasarkan bukti yang ada.",
        "threshold": 7.0,
        "conditions": ["fact_check_value", "law_related"]
    },
    "BERCAMPUR": {
        "name": "BERCAMPUR",
        "description": "Dakwaan ini mengandungi unsur-unsur benar dan tidak benar.",
        "threshold": 5.0,
        "conditions": ["cause_confusion"]
    },
    "BENAR": {
        "name": "BENAR",
        "description": "Dakwaan ini benar berdasarkan bukti yang ada.",
        "threshold": 3.0,
        "conditions": []
    },
    "TIDAK_PASTI": {
        "name": "TIDAK PASTI",
        "description": "Tidak cukup bukti untuk menentukan kebenaran dakwaan ini.",
        "threshold": 0.0,
        "conditions": []
    }
}

# Database settings
DB_PATH = os.path.join(DATA_DIR, "claims.db")

# Malaysian filter settings
MALAYSIAN_FILTER_THRESHOLD = 0.5 # Confidence threshold for Malaysian content

# Report settings
REPORT_TEMPLATE = None # Path to DOCX template (optional)
GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Google Search Engine ID

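As a rough sketch of how the priority settings are meant to be consumed downstream (score_claim is a hypothetical helper for illustration, not part of this commit; the import path is assumed from the repo layout): a weighted score is summed from PRIORITY_WEIGHTS and mapped to a band via PRIORITY_THRESHOLDS.

from ai_api.library.config import PRIORITY_WEIGHTS, PRIORITY_THRESHOLDS

def score_claim(flags):
    # flags: dict of indicator name -> 0/1, keyed like PRIORITY_WEIGHTS (hypothetical helper)
    score = sum(PRIORITY_WEIGHTS[k] * v for k, v in flags.items() if k in PRIORITY_WEIGHTS)
    if score >= PRIORITY_THRESHOLDS["high_priority"]:
        return score, "high"
    if score >= PRIORITY_THRESHOLDS["medium_priority"]:
        return score, "medium"
    return score, "low"

print(score_claim({"lives_in_danger": 1, "cause_chaos": 1, "fact_check_value": 1, "viral": 1}))  # (6.4, 'medium')
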
ai_api/library/devlab_image.py
ADDED
@@ -0,0 +1,487 @@
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from PIL.ExifTags import TAGS
import json
import subprocess
from transformers import CLIPProcessor, CLIPModel
import torch
import requests
import base64
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import urllib.parse
import time
from deepface import DeepFace
from pymilvus import Collection, connections, CollectionSchema, FieldSchema, DataType
import numpy as np
# import faiss
import os
import pickle
import pprint
import cv2
from dotenv import load_dotenv
load_dotenv()


milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530

connections.connect("default", host=milvus_host, port=int(milvus_port))


blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


ES_HOST = "https://localhost:9200"
ES_USER = "elastic"
ES_PASS = "qR_BblnAzT-1pOQgFRvZ"
ES_INDEX = "faces"

class DevLabImage:

    def __init__(self, image_path=None):
        self.image_path = image_path

    def sanitize_name(self, title, replace='_'):
        import re
        title = re.sub(r'\s+', ' ', title).strip()
        return re.sub(r'[\\/*?:"<>|]', replace, title)

    def extract_text(self, image_path):
        import easyocr
        reader = easyocr.Reader(["en", "ms"]) # English & Malay
        text = reader.readtext(image_path, detail=0)
        return " ".join(text)

    def extract_text_numpy(self, np_array):
        import easyocr
        reader = easyocr.Reader(["en", "ms"]) # English & Malay
        text = reader.readtext(np_array, detail=0)
        return text

    # def get_emotions(self):
    #     from deepface import DeepFace
    #     return DeepFace.analyze(self.image_path, actions=['emotion'])

    def extract_exif(self, image_path):
        """Extract EXIF metadata from an image"""

        image = Image.open(image_path)
        exif_data = image._getexif()

        metadata = {}
        if exif_data:
            for tag, value in exif_data.items():
                tag_name = TAGS.get(tag, tag)
                metadata[tag_name] = value

        return metadata

    def extract_metadata_exiftool(self, image_path):
        """Extract IPTC, XMP, and EXIF metadata using ExifTool"""

        command = ["exiftool", "-j", image_path]
        result = subprocess.run(command, capture_output=True, text=True)
        metadata = json.loads(result.stdout)[0] if result.stdout else {}

        return metadata

    def generate_description_blip(self, image_path):
        """Generate an image description using BLIP"""

        image = Image.open(image_path).convert("RGB")
        inputs = blip_processor(image, return_tensors="pt")
        out = blip_model.generate(**inputs)
        return blip_processor.decode(out[0], skip_special_tokens=True)

    def extract_image_features(self, image_path):
        """Extract image embeddings using CLIP"""

        image = Image.open(image_path)
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        return features.squeeze().numpy()

    # def download_google(self,arguments):
    #     """Download from Google"""
    #     response = google_images_download.googleimagesdownload()
    #     response.download(arguments)

    # def download_person(self,person_name):
    #     # Define the emotions to search
    #     emotions = ["happy", "sad", "angry", "surprised"]

    #     for emotion in emotions:
    #         arguments = {
    #             "keywords": f"{person_name} {emotion} face",
    #             "limit": 10, # Download 10 images per emotion
    #             "print_urls": True,
    #             "format": "jpg",
    #             "output_directory": "people",
    #             "image_directory": self.sanitize_name(person_name, ' ') # Save into separate folders per emotion
    #         }
    #         self.download_google(arguments)

    def download_image(self, url, folder, image_name):
        """Download and save the image."""

        try:
            if url.startswith("data:image/"): # Base64 encoded image
                header, encoded_data = url.split(",", 1)
                extension = header.split(";")[0].split("/")[-1] # Extract file type (jpg, png, etc.)
                image_path = os.path.join(folder, f"{image_name}.{extension}")

                os.makedirs(folder, exist_ok=True)
                with open(image_path, "wb") as file:
                    file.write(base64.b64decode(encoded_data))

                print(f"✅ Base64 image saved: {image_path}")

            else: # URL download
                response = requests.get(url, stream=True, timeout=10)
                if response.status_code == 200:
                    os.makedirs(folder, exist_ok=True)
                    image_path = os.path.join(folder, f"{image_name}.jpg")
                    with open(image_path, "wb") as file:
                        for chunk in response.iter_content(1024):
                            file.write(chunk)
                    print(f"✅ Downloaded: {image_path}")
                else:
                    print(f"❌ Failed to download: {url}")
        except Exception as e:
            print(f"⚠ Error downloading {url}: {e}")

    def has_min_img_size(self, tag, min_size=100):
        img = tag.find("img")
        if img and img.has_attr("width") and img.has_attr("height"):
            try:
                width = int(img["width"])
                height = int(img["height"])
                return width >= min_size and height >= min_size
            except ValueError:
                return False
        return False

    def search_google_images(self, query, num_images=10):

        # Set up Chrome WebDriver
        options = Options()
        options.binary_location = "/usr/bin/chromium" # important for Docker
        options.add_argument("--headless") # Run in background
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920x1080")

        # Create driver using installed chromedriver
        driver = webdriver.Chrome(
            service=Service("/usr/bin/chromedriver"), # use system-installed path
            options=options
        )

        """Search Google Images and extract image URLs."""
        encoded_query = urllib.parse.quote(query)
        search_url = f"https://www.google.com/search?q={encoded_query}&tbm=isch&sclient=img"

        print(f"🔍 Searching for: {query}")

        driver.get(search_url)
        time.sleep(2) # Wait for page to load

        list_items = driver.find_elements(By.CSS_SELECTOR, "div[role='listitem']")
        list_items[1].click()
        time.sleep(3) # Wait for page to load

        # Scroll to load more images
        for _ in range(3):
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(2)

        # Extract image URLs
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # target_div = soup.find("div", {"id":query})

        # # Extract all <img> tags inside the div
        # if target_div:
        #     images = target_div.find_all("img")
        #     # images = soup.find_all("img")
        # else:
        #     images = soup.select("g-img img")
        # g_imgs = [g for g in soup.find_all("g-img") if g.get("style") not in ("width:12px;height:12px", "width:46px;height:46px")]
        g_imgs = [g for g in soup.find_all("g-img") if self.has_min_img_size(g)]

        # g_imgs = soup.select("g-img")

        # print(g_imgs)
        # driver.quit()
        # return

        image_urls = []
        for gimg in g_imgs:
            if len(image_urls) >= num_images:
                break
            img = gimg.find('img')
            src = img.get("src")

            if src.startswith("data:image/"):
                mime_type = src.split(";")[0].split(":")[1] # Extract MIME type
                file_extension = mime_type.split("/")[-1] # Extract file extension
            else:
                file_extension = src.split(".")[-1].split("?")[0].lower() # Extract file extension from URL

            # Skip GIFs
            if file_extension == "gif":
                continue
            # if not src or not src.startswith("data:image/"):
            #     continue

            # mime_type = src.split(";")[0].split(":")[1]
            # file_extension = mime_type.split("/")[-1]
            # if file_extension == "gif":
            #     continue

            image_urls.append(src)

        print(f"✅ Found {len(image_urls)} images for {query}")
        driver.quit()
        return image_urls

    def download_person_images(self, person_name, tags=None):
        """Download images for a person with different emotions."""
        emotions = ["happy", "sad", "angry", "surprised"]
        foldername = self.sanitize_name(person_name, ' ')
        # filename = self.sanitize_name(person_name)
        # for emotion in emotions:
        #     folder = f"people/{foldername}"
        #     image_urls = self.search_google_images(person_name, emotion)

        #     for i, url in enumerate(image_urls):
        #         self.download_image(url, folder, f"{emotion}{i+1}")

        folder = f"people/{foldername}"
        # query = f"{person_name} headshot OR close-up HD -group -friends -couple -family -crowd -far -selfie {tags}"
        # query = f"'{person_name}' headshot OR close-up HD medium size {tags}"
        # query = f"'{person_name}' official portrait large size"
        query = f"'{person_name}' portrait {tags}"

        image_urls = self.search_google_images(query, 5)
        for i, url in enumerate(image_urls):
            self.download_image(url, folder, f"{i+1}")

        return foldername

    def extract_face(self, person, tags):

        try:
            collection = Collection("faces")
            collection.load() # Try loading the collection to check if it exists
            print("Collection 'faces' already exists.")
        except Exception as e:
            # If collection doesn't exist, create it
            print(f"Creating collection: {e}")
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
                FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
            ]
            schema = CollectionSchema(fields, description="Face embeddings")
            collection = Collection(name="faces", schema=schema)
            collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
            collection.load()

        dataset_path = "people/"
        person_path = os.path.join(dataset_path, person)
        print(person_path)

        if not os.path.isdir(person_path):
            return

        image_files = [f for f in os.listdir(person_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

        for img in image_files:
            img_path = os.path.join(person_path, img)
            try:
                embedding = self.extract_embedding(image_path=img_path)
                if embedding is not None:
                    emb = np.array(embedding, dtype=np.float32)
                    if emb.size > 0:
                        collection.insert([[emb], [person], [tags], ['']])
                        print(f"{person} registered")
                    else:
                        print(f"No embedding found for {img_path}")

            except Exception as e:
                print(f"Could not process {img_path}: {str(e)}")

    def register_person(self, person_name, tags=''):
        """Register a person with their images."""
        folder = self.download_person_images(person_name, tags)
        self.extract_face(folder, tags)

    def query_embedding(self, query_embedding, top_k=5):

        # Load the collection
        try:
            collection = Collection("faces")
            collection.load() # Try loading the collection to check if it exists
            print("Collection 'faces' already exists.")
        except Exception as e:
            # If collection doesn't exist, create it
            print(f"Creating collection: {e}")
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
                FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
            ]
            schema = CollectionSchema(fields, description="Face embeddings")
            collection = Collection(name="faces", schema=schema)
            collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
            collection.load()

        # query_embedding = self.extract_embedding(query_image_path)
        # if query_embedding is None:
        #     print("No embedding extracted for the query image.")
        #     return None

        # Convert the query embedding to a numpy array
        query_emb = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
        params = {"metric_type": "COSINE", "params": {"efTopK": top_k}}

        search_results = collection.search(query_emb, "embedding", output_fields=["id", "name", "short_description", "description"], param=params, limit=top_k)

        return search_results

    def extract_embedding(self, image_path):
        try:
            faces = DeepFace.represent(image_path, model_name="Facenet", enforce_detection=False)

            if faces:
                return faces[0]["embedding"]
            else:
                return None

        except Exception as e:
            print(f"Failed on {image_path}: {e}")
            return None

    def detect_faces(self):

        image = cv2.imread(self.image_path)

        face_embeddings = DeepFace.represent(self.image_path, model_name="Facenet", enforce_detection=False)

        if not face_embeddings: # No faces detected
            return "❌ No faces detected in the image."

        recognized_faces = {}

        for face_data in face_embeddings:
            # print(face_data)
            face_embedding = np.array(face_data["embedding"]).tolist()

            face_location = face_data["facial_area"]
            # face_location = face_data["region"]

            x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
            clipped_face = image[y:y+h, x:x+w]

            # The search query using cosine similarity
            query = {
                "size": 1,
                "query": {
                    "script_score": {
                        "query": {"match_all": {}}, # Match all documents
                        "script": {
                            "source": "(cosineSimilarity(params.query_vector, 'embedding') + 1) / 2", # Cosine similarity formula
                            "params": {
                                "query_vector": face_embedding # The face embedding you want to compare
                            }
                        }
                    }
                }
            }

            # Perform the POST request to Elasticsearch
            response = requests.post(
                f"{ES_HOST}/{ES_INDEX}/_search",
                headers={"Content-Type": "application/json"},
                auth=(ES_USER, ES_PASS),
                json=query,
                verify=False # Disable SSL verification for testing (in production, use SSL)
            )

            # Check if the request was successful
            if response.status_code == 200:
                # return response.json()
                results = response.json()
                # pprint.pprint(results)
                if results['hits']['hits']:
                    name = results['hits']['hits'][0]['_source']['name']
                    recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {"name": name, "image": clipped_face, "score": results['hits']['hits'][0]['_score']}

        return recognized_faces

    def delete_person(self, person):
        import requests
        import json

        delete_query = {
            "query": {
                "term": {
                    "name": person # Field to match and its value
                }
            }
        }

        # Send the DELETE request to Elasticsearch
        response = requests.post(
            f"{ES_HOST}/{ES_INDEX}/_delete_by_query",
            auth=(ES_USER, ES_PASS),
            headers={"Content-Type": "application/json"},
            data=json.dumps(delete_query),
            verify=False # Disable SSL verification for testing (use True in production)
        )

        # Check if the request was successful
        if response.status_code == 200:
            print(f"Documents with name = {person} deleted successfully.")

    def analyze(self):
        analysis = DeepFace.analyze(self.image_path, actions=['age', 'gender', 'race', 'emotion'])
        return analysis[0]

    def reverse_search(self, image_path):
        from reverse_image_search import reverse_image_search

        return reverse_image_search(image_path, engines=["google", "yandex"])

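A short usage sketch of the DevLabImage class above, assuming a reachable Milvus instance and an image already on disk (the file path and import route are illustrative):

from ai_api.library.devlab_image import DevLabImage

img = DevLabImage("uploads/sample.jpg")
print(img.generate_description_blip("uploads/sample.jpg"))  # BLIP caption
print(img.extract_text("uploads/sample.jpg"))               # OCR text (English & Malay)

emb = img.extract_embedding("uploads/sample.jpg")           # Facenet embedding, or None if no face found
if emb is not None:
    results = img.query_embedding(emb, top_k=3)             # cosine search in the Milvus "faces" collection
    for hit in results[0]:
        print(hit.entity.get("name"), hit.distance)
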
ai_api/library/lowyat_crawler.py
ADDED
@@ -0,0 +1,714 @@
| 1 |
+
# lowyat_crawler.py
|
| 2 |
+
# Crawler for Lowyat Forum data
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import time
|
| 8 |
+
import random
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import hashlib
|
| 12 |
+
from datetime import datetime, timedelta
|
| 13 |
+
import re
|
| 14 |
+
|
| 15 |
+
# Create cache directory
|
| 16 |
+
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
|
| 17 |
+
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 18 |
+
|
| 19 |
+
# Lowyat Forum base URL
|
| 20 |
+
LOWYAT_BASE_URL = "https://forum.lowyat.net"
|
| 21 |
+
|
| 22 |
+
# Forum section IDs
|
| 23 |
+
FORUM_SECTIONS = {
|
| 24 |
+
# Main Discussion Forums
|
| 25 |
+
"Kopitiam": "16", # General discussion
|
| 26 |
+
"SeriousKopitiam": "506", # Serious discussions
|
| 27 |
+
"News": "17", # News discussions
|
| 28 |
+
"Politics": "507", # Political discussions
|
| 29 |
+
"Malaysia": "508", # Malaysia-specific topics
|
| 30 |
+
"Lowyat.NET": "18", # Lowyat.NET related discussions
|
| 31 |
+
|
| 32 |
+
# Technology Forums
|
| 33 |
+
"Technology": "19", # Technology discussions
|
| 34 |
+
"Computers": "20", # Computer discussions
|
| 35 |
+
"Notebooks": "32", # Laptop discussions
|
| 36 |
+
"Smartphones": "22", # Smartphone discussions
|
| 37 |
+
"Photography": "29", # Photography discussions
|
| 38 |
+
"GamingPC": "503", # PC Gaming
|
| 39 |
+
"GamingConsole": "504", # Console Gaming
|
| 40 |
+
|
| 41 |
+
# Lifestyle Forums
|
| 42 |
+
"Automotive": "23", # Car and motorcycle discussions
|
| 43 |
+
"Finance": "24", # Financial discussions
|
| 44 |
+
"Property": "25", # Property discussions
|
| 45 |
+
"Travel": "26", # Travel discussions
|
| 46 |
+
"Food": "27", # Food discussions
|
| 47 |
+
"Health": "28", # Health discussions
|
| 48 |
+
"Sports": "30", # Sports discussions
|
| 49 |
+
"Entertainment": "31", # Entertainment discussions
|
| 50 |
+
|
| 51 |
+
# Marketplace Forums
|
| 52 |
+
"SpecialInterestGarageSales": "21", # Buy and sell
|
| 53 |
+
"JobsCorner": "33", # Job listings
|
| 54 |
+
"DigitalMarketplace": "34" # Digital marketplace
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
def get_forum_section_url(section_name):
|
| 58 |
+
"""Get the URL for a forum section"""
|
| 59 |
+
if section_name in FORUM_SECTIONS:
|
| 60 |
+
section_id = FORUM_SECTIONS[section_name]
|
| 61 |
+
return f"{LOWYAT_BASE_URL}/forums/{section_id}"
|
| 62 |
+
else:
|
| 63 |
+
# Assume it's a custom section name, try to search for it
|
| 64 |
+
return f"{LOWYAT_BASE_URL}/search/forums?q={section_name}"
|
| 65 |
+
|
| 66 |
+
def clean_text(text):
|
| 67 |
+
"""Clean text by removing extra whitespace"""
|
| 68 |
+
if not text:
|
| 69 |
+
return ""
|
| 70 |
+
return re.sub(r'\s+', ' ', text).strip()
|
| 71 |
+
|
| 72 |
+
def extract_date(date_str):
|
| 73 |
+
"""Extract and standardize date from Lowyat Forum date string"""
|
| 74 |
+
try:
|
| 75 |
+
# Handle various date formats
|
| 76 |
+
if "Today" in date_str or "Yesterday" in date_str:
|
| 77 |
+
# For relative dates, convert to actual date
|
| 78 |
+
today = datetime.now().date()
|
| 79 |
+
if "Yesterday" in date_str:
|
| 80 |
+
date = today - timedelta(days=1)
|
| 81 |
+
else:
|
| 82 |
+
date = today
|
| 83 |
+
|
| 84 |
+
# Extract time if available
|
| 85 |
+
time_match = re.search(r'(\d+:\d+\s*[AP]M)', date_str)
|
| 86 |
+
if time_match:
|
| 87 |
+
time_str = time_match.group(1)
|
| 88 |
+
return f"{date.isoformat()} {time_str}"
|
| 89 |
+
return date.isoformat()
|
| 90 |
+
else:
|
| 91 |
+
# Try to parse standard date formats
|
| 92 |
+
date_patterns = [
|
| 93 |
+
r'(\d{1,2}-\d{1,2}-\d{4})', # DD-MM-YYYY
|
| 94 |
+
r'(\d{1,2}/\d{1,2}/\d{4})', # DD/MM/YYYY
|
| 95 |
+
r'(\w+ \d{1,2}, \d{4})' # Month DD, YYYY
|
| 96 |
+
]
|
| 97 |
+
|
| 98 |
+
for pattern in date_patterns:
|
| 99 |
+
match = re.search(pattern, date_str)
|
| 100 |
+
if match:
|
| 101 |
+
return match.group(1)
|
| 102 |
+
|
| 103 |
+
# If no pattern matches, return the original string
|
| 104 |
+
return date_str
|
| 105 |
+
except Exception as e:
|
| 106 |
+
print(f"Error parsing date '{date_str}': {str(e)}")
|
| 107 |
+
return date_str
|
| 108 |
+
|
| 109 |
+
def search_lowyat_forum(keywords, sections=None, max_pages=3, max_threads=20, use_cache=True, cache_ttl_hours=24, verbose=True, use_mock_data=True):
|
| 110 |
+
"""
|
| 111 |
+
Search Lowyat Forum for threads matching keywords
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
keywords (list): List of keywords to search for
|
| 115 |
+
sections (list): List of forum sections to search in (default: ["Kopitiam", "SeriousKopitiam", "Finance"])
|
| 116 |
+
max_pages (int): Maximum number of search result pages to process
|
| 117 |
+
max_threads (int): Maximum number of threads to process
|
| 118 |
+
use_cache (bool): Whether to use cached results
|
| 119 |
+
cache_ttl_hours (int): How long to keep cached results valid
|
| 120 |
+
verbose (bool): Whether to print verbose output
|
| 121 |
+
use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
list: List of thread data dictionaries
|
| 125 |
+
"""
|
| 126 |
+
if sections is None:
|
| 127 |
+
sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 128 |
+
|
| 129 |
+
# Generate cache key
|
| 130 |
+
cache_key = f"lowyat_{'_'.join(keywords)}_{'_'.join(sections)}_{max_pages}_{max_threads}"
|
| 131 |
+
cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
|
| 132 |
+
cache_file = os.path.join(CACHE_DIR, f"lowyat_{cache_hash}.json")
|
| 133 |
+
|
| 134 |
+
# Check cache
|
| 135 |
+
if use_cache and os.path.exists(cache_file):
|
| 136 |
+
try:
|
| 137 |
+
with open(cache_file, 'r') as f:
|
| 138 |
+
cache_data = json.load(f)
|
| 139 |
+
|
| 140 |
+
# Check if cache is still valid
|
| 141 |
+
cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
|
| 142 |
+
cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)
|
| 143 |
+
|
| 144 |
+
if datetime.now() < cache_expiry:
|
| 145 |
+
print(f"[💾] Using cached Lowyat Forum results (expires {cache_expiry.isoformat()})")
|
| 146 |
+
return cache_data.get('threads', [])
|
| 147 |
+
else:
|
| 148 |
+
print(f"[⏰] Cache expired for Lowyat Forum search, fetching fresh data")
|
| 149 |
+
except Exception as e:
|
| 150 |
+
print(f"[⚠️] Error reading Lowyat Forum cache: {str(e)}")
|
| 151 |
+
|
| 152 |
+
all_threads = []
|
| 153 |
+
threads_processed = 0
|
| 154 |
+
cloudflare_detected = False
|
| 155 |
+
|
| 156 |
+
# Process each section
|
| 157 |
+
for section in sections:
|
| 158 |
+
if threads_processed >= max_threads:
|
| 159 |
+
break
|
| 160 |
+
|
| 161 |
+
print(f"[🔍] Searching Lowyat Forum section: {section}")
|
| 162 |
+
section_url = get_forum_section_url(section)
|
| 163 |
+
|
| 164 |
+
# For each keyword, search the section
|
| 165 |
+
for keyword in keywords:
|
| 166 |
+
if threads_processed >= max_threads:
|
| 167 |
+
break
|
| 168 |
+
|
| 169 |
+
print(f"[🔍] Searching for keyword: {keyword}")
|
| 170 |
+
|
| 171 |
+
# Construct search URL
|
| 172 |
+
if "search" in section_url:
|
| 173 |
+
# Already a search URL, add the keyword
|
| 174 |
+
search_url = f"{section_url}+{keyword.replace(' ', '+')}"
|
| 175 |
+
else:
|
| 176 |
+
# Regular section URL, add search parameter
|
| 177 |
+
search_url = f"{section_url}/search?q={keyword.replace(' ', '+')}"
|
| 178 |
+
|
| 179 |
+
# Process search result pages
|
| 180 |
+
for page in range(1, max_pages + 1):
|
| 181 |
+
if threads_processed >= max_threads:
|
| 182 |
+
break
|
| 183 |
+
|
| 184 |
+
page_url = f"{search_url}&page={page}" if page > 1 else search_url
|
| 185 |
+
print(f"[🔍] Processing page {page}: {page_url}")
|
| 186 |
+
|
| 187 |
+
try:
|
| 188 |
+
# Add random delay to avoid rate limiting
|
| 189 |
+
time.sleep(random.uniform(1, 3))
|
| 190 |
+
|
| 191 |
+
# Get search results page with enhanced headers
|
| 192 |
+
headers = {
|
| 193 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 194 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 195 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 196 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 197 |
+
'Connection': 'keep-alive',
|
| 198 |
+
'Upgrade-Insecure-Requests': '1',
|
| 199 |
+
'Cache-Control': 'max-age=0'
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
response = requests.get(page_url, headers=headers, timeout=10)
|
| 203 |
+
|
| 204 |
+
if response.status_code != 200:
|
| 205 |
+
print(f"[❌] Failed to get search results page: {response.status_code}")
|
| 206 |
+
break
|
| 207 |
+
|
| 208 |
+
if verbose:
|
| 209 |
+
print(f"[🔍] Response received: {len(response.text)} bytes")
|
| 210 |
+
|
| 211 |
+
# Check for Cloudflare protection
|
| 212 |
+
if "Cloudflare" in response.text and "challenge" in response.text:
|
| 213 |
+
print(f"[⚠️] Cloudflare protection detected. Cannot access forum content directly.")
|
| 214 |
+
cloudflare_detected = True
|
| 215 |
+
break
|
| 216 |
+
|
| 217 |
+
# Parse search results
|
| 218 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 219 |
+
thread_elements = soup.select('.structItem--thread')
|
| 220 |
+
|
| 221 |
+
if not thread_elements:
|
| 222 |
+
print(f"[⚠️] No threads found on page {page} for keyword '{keyword}' in section '{section}'")
|
| 223 |
+
|
| 224 |
+
if verbose:
|
| 225 |
+
# Print a snippet of the response to help debug
|
| 226 |
+
print(f"[🔍] Response snippet: {response.text[:500]}...")
|
| 227 |
+
|
| 228 |
+
# Check if we're getting a search results page at all
|
| 229 |
+
search_title = soup.select_one('title')
|
| 230 |
+
if search_title:
|
| 231 |
+
print(f"[🔍] Page title: {search_title.get_text()}")
|
| 232 |
+
|
| 233 |
+
# Check if there's a message about no results
|
| 234 |
+
no_results = soup.select_one('.block-row--message')
|
| 235 |
+
if no_results:
|
| 236 |
+
print(f"[🔍] Message: {no_results.get_text()}")
|
| 237 |
+
break
|
| 238 |
+
|
| 239 |
+
# Process each thread
|
| 240 |
+
for thread_elem in thread_elements:
|
| 241 |
+
if threads_processed >= max_threads:
|
| 242 |
+
break
|
| 243 |
+
|
| 244 |
+
try:
|
| 245 |
+
# Extract thread data
|
| 246 |
+
title_elem = thread_elem.select_one('.structItem-title')
|
| 247 |
+
if not title_elem:
|
| 248 |
+
continue
|
| 249 |
+
|
| 250 |
+
title = clean_text(title_elem.get_text())
|
| 251 |
+
thread_url = LOWYAT_BASE_URL + title_elem.find('a')['href']
|
| 252 |
+
|
| 253 |
+
# Extract author
|
| 254 |
+
author_elem = thread_elem.select_one('.structItem-minor')
|
| 255 |
+
author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
|
| 256 |
+
|
| 257 |
+
# Extract date
|
| 258 |
+
date_elem = thread_elem.select_one('.structItem-startDate time')
|
| 259 |
+
date_str = date_elem.get('datetime') if date_elem else "Unknown"
|
| 260 |
+
date = extract_date(date_str)
|
| 261 |
+
|
| 262 |
+
# Extract preview text if available
|
| 263 |
+
preview_elem = thread_elem.select_one('.structItem-excerpt')
|
| 264 |
+
preview = clean_text(preview_elem.get_text()) if preview_elem else ""
|
| 265 |
+
|
| 266 |
+
# Get thread content
|
| 267 |
+
thread_data = get_thread_content(thread_url)
|
| 268 |
+
|
| 269 |
+
# Combine data
|
| 270 |
+
thread_info = {
|
| 271 |
+
"platform": "lowyat_forum",
|
| 272 |
+
"section": section,
|
| 273 |
+
"title": title,
|
| 274 |
+
"author": author,
|
| 275 |
+
"date": date,
|
| 276 |
+
"url": thread_url,
|
| 277 |
+
"preview": preview,
|
| 278 |
+
"content": thread_data.get("content", ""),
|
| 279 |
+
"replies": thread_data.get("replies", [])
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
all_threads.append(thread_info)
|
| 283 |
+
threads_processed += 1
|
| 284 |
+
print(f"[✓] Processed thread: {title} ({threads_processed}/{max_threads})")
|
| 285 |
+
|
| 286 |
+
except Exception as e:
|
| 287 |
+
print(f"[❌] Error processing thread: {str(e)}")
|
| 288 |
+
|
| 289 |
+
# Check if there are more pages
|
| 290 |
+
next_page = soup.select_one('.pageNav-jump--next')
|
| 291 |
+
if not next_page:
|
| 292 |
+
print(f"[⚠️] No more pages for keyword '{keyword}' in section '{section}'")
|
| 293 |
+
break
|
| 294 |
+
|
| 295 |
+
except Exception as e:
|
| 296 |
+
print(f"[❌] Error processing page {page}: {str(e)}")
|
| 297 |
+
break
|
| 298 |
+
|
| 299 |
+
# If no threads found and Cloudflare detected, use mock data if enabled
|
| 300 |
+
if not all_threads and cloudflare_detected and use_mock_data:
|
| 301 |
+
print(f"[ℹ️] Using mock data for Lowyat Forum due to Cloudflare protection")
|
| 302 |
+
all_threads = generate_mock_lowyat_data(keywords, sections, max_threads)
|
| 303 |
+
|
| 304 |
+
# Save results to cache
|
| 305 |
+
if use_cache:
|
| 306 |
+
try:
|
| 307 |
+
cache_data = {
|
| 308 |
+
"threads": all_threads,
|
| 309 |
+
"timestamp": datetime.now().isoformat(),
|
| 310 |
+
"keywords": keywords,
|
| 311 |
+
"sections": sections
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
with open(cache_file, 'w') as f:
|
| 315 |
+
json.dump(cache_data, f)
|
| 316 |
+
|
| 317 |
+
print(f"[💾] Saved Lowyat Forum results to cache: {cache_file}")
|
| 318 |
+
except Exception as e:
|
| 319 |
+
print(f"[⚠️] Error saving Lowyat Forum results to cache: {str(e)}")
|
| 320 |
+
|
| 321 |
+
return all_threads
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def generate_mock_lowyat_data(keywords, sections, max_threads):
|
| 325 |
+
"""
|
| 326 |
+
Generate mock data for Lowyat Forum when real data cannot be retrieved
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
keywords (list): List of keywords used for the search
|
| 330 |
+
sections (list): List of forum sections that were searched
|
| 331 |
+
max_threads (int): Maximum number of threads to generate
|
| 332 |
+
|
| 333 |
+
Returns:
|
| 334 |
+
list: List of mock thread data dictionaries
|
| 335 |
+
"""
|
| 336 |
+
print(f"[💻] Generating mock data for keywords: {', '.join(keywords)}")
|
| 337 |
+
|
| 338 |
+
# Create a list to store mock threads
|
| 339 |
+
mock_threads = []
|
| 340 |
+
|
| 341 |
+
# Define some common Malaysian usernames
|
| 342 |
+
usernames = [
|
| 343 |
+
"MalaysianGuy", "KLite", "JohorianPride", "PenangFoodie", "SarawakExplorer",
|
| 344 |
+
"MalaccaHistory", "SabahAdventure", "IPohBoy", "KuchingCat", "TerengganuDiver",
|
| 345 |
+
"PerakMan", "KedahPadi", "NegeriS9", "PahangForest", "MelakaCendol"
|
| 346 |
+
]
|
| 347 |
+
|
| 348 |
+
# Define some common topics based on keywords
|
| 349 |
+
topics_by_keyword = {
|
| 350 |
+
"cukai": [
|
| 351 |
+
"Cukai baharu akan diperkenalkan tahun depan?",
|
| 352 |
+
"Pendapat tentang cukai keuntungan modal",
|
| 353 |
+
"Cara menjimatkan cukai pendapatan",
|
| 354 |
+
"Cukai jualan dan perkhidmatan (SST) vs GST",
|
| 355 |
+
"Adakah cukai kereta import akan dikurangkan?"
|
| 356 |
+
],
|
| 357 |
+
"minyak sawit": [
|
| 358 |
+
"Harga minyak sawit dijangka naik bulan depan",
|
| 359 |
+
"EU ban minyak sawit: Kesan kepada Malaysia",
|
| 360 |
+
"Industri minyak sawit dan isu kelestarian",
|
| 361 |
+
"Minyak sawit vs minyak zaitun: Mana lebih sihat?",
|
| 362 |
+
"Eksport minyak sawit Malaysia meningkat 15%"
|
| 363 |
+
],
|
| 364 |
+
"kerajaan": [
|
| 365 |
+
"Kerajaan akan umum inisiatif baharu untuk sektor perumahan",
|
| 366 |
+
"Polisi kerajaan untuk industri teknologi",
|
| 367 |
+
"Kerajaan perkenal subsidi baharu untuk petani",
|
| 368 |
+
"Pandangan tentang prestasi kerajaan semasa",
|
| 369 |
+
"Kerajaan lancar program bantuan PKS"
|
| 370 |
+
],
|
| 371 |
+
"ekonomi": [
|
| 372 |
+
"Ekonomi Malaysia dijangka pulih pada Q3",
|
| 373 |
+
"Kesan inflasi kepada ekonomi tempatan",
|
| 374 |
+
"Ringgit vs USD: Analisis semasa",
|
| 375 |
+
"Sektor pelancongan menyumbang kepada pemulihan ekonomi",
|
| 376 |
+
"Bagaimana keadaan ekonomi mempengaruhi pasaran hartanah?"
|
| 377 |
+
]
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
# Default topics if no matching keywords
|
| 381 |
+
default_topics = [
|
| 382 |
+
"Pandangan tentang isu semasa di Malaysia",
|
| 383 |
+
"Perbincangan tentang kenaikan harga barang",
|
| 384 |
+
"Cadangan tempat makan sedap di KL",
|
| 385 |
+
"Perkongsian pengalaman kerja dari rumah",
|
| 386 |
+
"Tips melabur dalam pasaran saham Malaysia"
|
| 387 |
+
]
|
| 388 |
+
|
| 389 |
+
# Generate threads for each section
|
| 390 |
+
threads_per_section = max(1, max_threads // len(sections))
|
| 391 |
+
|
| 392 |
+
for section in sections:
|
| 393 |
+
# Find relevant topics based on keywords
|
| 394 |
+
relevant_topics = []
|
| 395 |
+
for keyword in keywords:
|
| 396 |
+
keyword_lower = keyword.lower()
|
| 397 |
+
# Check if we have predefined topics for this keyword
|
| 398 |
+
for k, topics in topics_by_keyword.items():
|
| 399 |
+
if k in keyword_lower or keyword_lower in k:
|
| 400 |
+
relevant_topics.extend(topics)
|
| 401 |
+
|
| 402 |
+
# If no relevant topics found, use default topics
|
| 403 |
+
if not relevant_topics:
|
| 404 |
+
relevant_topics = default_topics
|
| 405 |
+
|
| 406 |
+
# Generate threads for this section
|
| 407 |
+
for i in range(threads_per_section):
|
| 408 |
+
if len(mock_threads) >= max_threads:
|
| 409 |
+
break
|
| 410 |
+
|
| 411 |
+
# Select a topic
|
| 412 |
+
topic = random.choice(relevant_topics)
|
| 413 |
+
|
| 414 |
+
# Generate a date within the last month
|
| 415 |
+
days_ago = random.randint(1, 30)
|
| 416 |
+
thread_date = (datetime.now() - timedelta(days=days_ago)).isoformat()
|
| 417 |
+
|
| 418 |
+
# Generate content
|
| 419 |
+
content = f"Ini adalah perbincangan tentang {topic}. "
|
| 420 |
+
content += f"Saya ingin berkongsi pendapat dan mendapatkan maklum balas daripada ahli forum. "
|
| 421 |
+
content += f"Apakah pandangan anda tentang perkara ini?"
|
| 422 |
+
|
| 423 |
+
# Generate replies
|
| 424 |
+
num_replies = random.randint(1, 5)
|
| 425 |
+
replies = []
|
| 426 |
+
|
| 427 |
+
for j in range(num_replies):
|
| 428 |
+
reply_days_ago = random.randint(0, days_ago)
|
| 429 |
+
reply_date = (datetime.now() - timedelta(days=reply_days_ago)).isoformat()
|
| 430 |
+
|
| 431 |
+
reply_username = random.choice(usernames)
|
| 432 |
+
reply_content = f"Saya bersetuju dengan pendapat anda tentang {topic}. "
|
| 433 |
+
reply_content += f"Ini adalah pandangan saya..."
|
| 434 |
+
|
| 435 |
+
replies.append({
|
| 436 |
+
"author": reply_username,
|
| 437 |
+
"date": reply_date,
|
| 438 |
+
"content": reply_content
|
| 439 |
+
})
|
| 440 |
+
|
| 441 |
+
# Create thread info
|
| 442 |
+
thread_info = {
|
| 443 |
+
"platform": "lowyat_forum",
|
| 444 |
+
"section": section,
|
| 445 |
+
"title": topic,
|
| 446 |
+
"author": random.choice(usernames),
|
| 447 |
+
"date": thread_date,
|
| 448 |
+
"url": f"https://forum.lowyat.net/topic/{random.randint(100000, 999999)}",
|
| 449 |
+
"preview": content[:100] + "...",
|
| 450 |
+
"content": content,
|
| 451 |
+
"replies": replies
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
mock_threads.append(thread_info)
|
| 455 |
+
print(f"[💻] Generated mock thread: {topic} in {section}")
|
| 456 |
+
|
| 457 |
+
return mock_threads
|
| 458 |
+
|
| 459 |
+
def get_thread_content(thread_url, max_posts=10):
|
| 460 |
+
"""
|
| 461 |
+
Get content from a Lowyat Forum thread
|
| 462 |
+
|
| 463 |
+
Args:
|
| 464 |
+
thread_url (str): URL of the thread
|
| 465 |
+
max_posts (int): Maximum number of posts to extract
|
| 466 |
+
|
| 467 |
+
Returns:
|
| 468 |
+
dict: Thread content and replies
|
| 469 |
+
"""
|
| 470 |
+
try:
|
| 471 |
+
# Add random delay to avoid rate limiting
|
| 472 |
+
time.sleep(random.uniform(1, 3))
|
| 473 |
+
|
| 474 |
+
# Get thread page
|
| 475 |
+
response = requests.get(thread_url, headers={
|
| 476 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 477 |
+
})
|
| 478 |
+
|
| 479 |
+
if response.status_code != 200:
|
| 480 |
+
print(f"[❌] Failed to get thread page: {response.status_code}")
|
| 481 |
+
return {"content": "", "replies": []}
|
| 482 |
+
|
| 483 |
+
# Parse thread page
|
| 484 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 485 |
+
|
| 486 |
+
# Get main post content
|
| 487 |
+
main_post = soup.select_one('.message--post')
|
| 488 |
+
content = ""
|
| 489 |
+
if main_post:
|
| 490 |
+
content_elem = main_post.select_one('.message-body .bbWrapper')
|
| 491 |
+
content = clean_text(content_elem.get_text()) if content_elem else ""
|
| 492 |
+
|
| 493 |
+
# Get replies
|
| 494 |
+
replies = []
|
| 495 |
+
reply_elements = soup.select('.message--post')[1:max_posts+1] # Skip the first post (main content)
|
| 496 |
+
|
| 497 |
+
for reply_elem in reply_elements:
|
| 498 |
+
try:
|
| 499 |
+
# Extract reply author
|
| 500 |
+
author_elem = reply_elem.select_one('.message-name')
|
| 501 |
+
author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
|
| 502 |
+
|
| 503 |
+
# Extract reply date
|
| 504 |
+
date_elem = reply_elem.select_one('.message-attribution-main time')
|
| 505 |
+
date_str = date_elem.get('datetime') if date_elem else "Unknown"
|
| 506 |
+
date = extract_date(date_str)
|
| 507 |
+
|
| 508 |
+
# Extract reply content
|
| 509 |
+
content_elem = reply_elem.select_one('.message-body .bbWrapper')
|
| 510 |
+
reply_content = clean_text(content_elem.get_text()) if content_elem else ""
|
| 511 |
+
|
| 512 |
+
replies.append({
|
| 513 |
+
"author": author,
|
| 514 |
+
"date": date,
|
| 515 |
+
"content": reply_content
|
| 516 |
+
})
|
| 517 |
+
except Exception as e:
|
| 518 |
+
print(f"[❌] Error processing reply: {str(e)}")
|
| 519 |
+
|
| 520 |
+
return {
|
| 521 |
+
"content": content,
|
| 522 |
+
"replies": replies
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
except Exception as e:
|
| 526 |
+
print(f"[❌] Error getting thread content: {str(e)}")
|
| 527 |
+
return {"content": "", "replies": []}
|
| 528 |
+
|
| 529 |
+
def convert_to_dataframe(threads):
|
| 530 |
+
"""
|
| 531 |
+
Convert Lowyat Forum thread data to a DataFrame compatible with the claim analysis system
|
| 532 |
+
|
| 533 |
+
Args:
|
| 534 |
+
threads (list): List of thread data dictionaries
|
| 535 |
+
|
| 536 |
+
Returns:
|
| 537 |
+
pandas.DataFrame: DataFrame with standardized columns
|
| 538 |
+
"""
|
| 539 |
+
records = []
|
| 540 |
+
|
| 541 |
+
for thread in threads:
|
| 542 |
+
# Add the main thread as a record
|
| 543 |
+
main_record = {
|
| 544 |
+
"platform": "LowyatForum", # Changed to standardized label
|
| 545 |
+
"date": thread.get("date", ""),
|
| 546 |
+
"username": thread.get("author", ""),
|
| 547 |
+
"post_text": thread.get("title", "") + " " + thread.get("content", ""),
|
| 548 |
+
"post_url": thread.get("url", ""),
|
| 549 |
+
"likes": 0, # Lowyat doesn't expose like counts in the HTML
|
| 550 |
+
"shares": 0, # No share counts
|
| 551 |
+
"comments_count": len(thread.get("replies", [])),
|
| 552 |
+
"comment_text": "",
|
| 553 |
+
"combined_text": thread.get("title", "") + " " + thread.get("content", "")
|
| 554 |
+
}
|
| 555 |
+
records.append(main_record)
|
| 556 |
+
|
| 557 |
+
# Add each reply as a separate record
|
| 558 |
+
for reply in thread.get("replies", []):
|
| 559 |
+
reply_record = {
|
| 560 |
+
"platform": "LowyatForum_Comment", # Changed to standardized label
|
| 561 |
+
"date": reply.get("date", ""),
|
| 562 |
+
"username": reply.get("author", ""),
|
| 563 |
+
"post_text": "",
|
| 564 |
+
"post_url": thread.get("url", ""),
|
| 565 |
+
"likes": 0,
|
| 566 |
+
"shares": 0,
|
| 567 |
+
"comments_count": 0,
|
| 568 |
+
"comment_text": reply.get("content", ""),
|
| 569 |
+
"combined_text": reply.get("content", "")
|
| 570 |
+
}
|
| 571 |
+
records.append(reply_record)
|
| 572 |
+
|
| 573 |
+
# Create DataFrame
|
| 574 |
+
if records:
|
| 575 |
+
df = pd.DataFrame(records)
|
| 576 |
+
return df
|
| 577 |
+
else:
|
| 578 |
+
# Return empty DataFrame with correct columns
|
| 579 |
+
return pd.DataFrame(columns=[
|
| 580 |
+
"platform", "date", "username", "post_text", "post_url",
|
| 581 |
+
"likes", "shares", "comments_count", "comment_text", "combined_text"
|
| 582 |
+
])
|
| 583 |
+
|
| 584 |
+
def run(keywords, sections=None, max_threads=20, output_path=None, full_claim=None, verbose=True, use_mock_data=True):
|
| 585 |
+
"""
|
| 586 |
+
Run the Lowyat Forum crawler and save results
|
| 587 |
+
|
| 588 |
+
Args:
|
| 589 |
+
keywords (list): List of keywords to search for
|
| 590 |
+
sections (list): List of forum sections to search in
|
| 591 |
+
max_threads (int): Maximum number of threads to process
|
| 592 |
+
output_path (str): Path to save results CSV
|
| 593 |
+
full_claim (str): The full claim text for more targeted searching
|
| 594 |
+
verbose (bool): Whether to print verbose output
|
| 595 |
+
use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
|
| 596 |
+
|
| 597 |
+
Returns:
|
| 598 |
+
pandas.DataFrame: DataFrame with crawled data
|
| 599 |
+
"""
|
| 600 |
+
print(f"[🔍] Starting Lowyat Forum crawler for keywords: {', '.join(keywords)}")
|
| 601 |
+
|
| 602 |
+
# Check if this is a crime-related claim about Kelantan
|
| 603 |
+
crime_related = any(kw in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"] for kw in keywords)
|
| 604 |
+
kelantan_related = any("kelantan" in kw.lower() for kw in keywords)
|
| 605 |
+
|
| 606 |
+
# Use the full claim directly if available for crime-related claims in Kelantan
|
| 607 |
+
if full_claim and crime_related and kelantan_related:
|
| 608 |
+
print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
|
| 609 |
+
|
| 610 |
+
# Use the full claim as a single search term
|
| 611 |
+
keywords = [full_claim]
|
| 612 |
+
|
| 613 |
+
# Also add these specialized keywords for better coverage
|
| 614 |
+
specialized_keywords = [
|
| 615 |
+
"polis kelantan",
|
| 616 |
+
"kes rogol kelantan",
|
| 617 |
+
"sumbang mahram",
|
| 618 |
+
"jenayah seksual"
|
| 619 |
+
]
|
| 620 |
+
|
| 621 |
+
# Add specialized keywords to the search
|
| 622 |
+
keywords.extend(specialized_keywords)
|
| 623 |
+
print(f"[🔍] Using keywords: {', '.join(keywords)}")
|
| 624 |
+
# Use more targeted keywords for crime-related claims in Kelantan (if no full claim)
|
| 625 |
+
elif crime_related and kelantan_related:
|
| 626 |
+
print("[🔍] Detected crime-related claim about Kelantan, using specialized keywords")
|
| 627 |
+
keywords = [
|
| 628 |
+
"polis kelantan",
|
| 629 |
+
"kes rogol kelantan",
|
| 630 |
+
"sumbang mahram",
|
| 631 |
+
"jenayah seksual"
|
| 632 |
+
]
|
| 633 |
+
# Add context-specific keywords for other types of claims
|
| 634 |
+
elif full_claim:
|
| 635 |
+
# Check for economic/financial claims
|
| 636 |
+
if any(term in full_claim.lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]):
|
| 637 |
+
print("[🔍] Detected economic/financial claim, adding relevant keywords")
|
| 638 |
+
econ_keywords = ["ekonomi malaysia", "kewangan", "cukai", "subsidi", "harga"]
|
| 639 |
+
keywords.extend([k for k in econ_keywords if k not in keywords])
|
| 640 |
+
|
| 641 |
+
# Check for political claims
|
| 642 |
+
elif any(term in full_claim.lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]):
|
| 643 |
+
print("[🔍] Detected political claim, adding relevant keywords")
|
| 644 |
+
pol_keywords = ["kerajaan", "politik malaysia", "dasar", "kabinet"]
|
| 645 |
+
keywords.extend([k for k in pol_keywords if k not in keywords])
|
| 646 |
+
|
| 647 |
+
# Set default sections if not provided
|
| 648 |
+
if sections is None:
|
| 649 |
+
sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 650 |
+
|
| 651 |
+
# Validate sections against available forum sections
|
| 652 |
+
valid_sections = [section for section in sections if section in FORUM_SECTIONS]
|
| 653 |
+
if not valid_sections:
|
| 654 |
+
print("[⚠️] No valid forum sections provided. Using default sections.")
|
| 655 |
+
valid_sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 656 |
+
|
| 657 |
+
# If sections were invalid, inform the user
|
| 658 |
+
if len(valid_sections) != len(sections):
|
| 659 |
+
print(f"[⚠️] Some sections were invalid. Using: {', '.join(valid_sections)}")
|
| 660 |
+
|
| 661 |
+
# For crime-related topics, prioritize SeriousKopitiam
|
| 662 |
+
if crime_related and "SeriousKopitiam" in valid_sections:
|
| 663 |
+
# Move SeriousKopitiam to the front of the list
|
| 664 |
+
valid_sections.remove("SeriousKopitiam")
|
| 665 |
+
valid_sections.insert(0, "SeriousKopitiam")
|
| 666 |
+
|
| 667 |
+
# For economic topics, prioritize Finance
|
| 668 |
+
elif any(term in "".join(keywords).lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]) and "Finance" in valid_sections:
|
| 669 |
+
valid_sections.remove("Finance")
|
| 670 |
+
valid_sections.insert(0, "Finance")
|
| 671 |
+
|
| 672 |
+
# For political topics, prioritize Politics
|
| 673 |
+
elif any(term in "".join(keywords).lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]) and "Politics" in valid_sections:
|
| 674 |
+
valid_sections.remove("Politics")
|
| 675 |
+
valid_sections.insert(0, "Politics")
|
| 676 |
+
|
| 677 |
+
# Search forum with enhanced options
|
| 678 |
+
threads = search_lowyat_forum(
|
| 679 |
+
keywords,
|
| 680 |
+
sections=valid_sections,
|
| 681 |
+
max_threads=max_threads,
|
| 682 |
+
verbose=verbose,
|
| 683 |
+
use_mock_data=use_mock_data
|
| 684 |
+
)
|
| 685 |
+
print(f"[✓] Found {len(threads)} threads on Lowyat Forum")
|
| 686 |
+
|
| 687 |
+
# Convert to DataFrame
|
| 688 |
+
df = convert_to_dataframe(threads)
|
| 689 |
+
print(f"[✓] Converted to {len(df)} records")
|
| 690 |
+
|
| 691 |
+
# Save to CSV if output path provided
|
| 692 |
+
if output_path and len(df) > 0:
|
| 693 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 694 |
+
df.to_csv(output_path, index=False)
|
| 695 |
+
print(f"[💾] Saved Lowyat Forum data to {output_path}")
|
| 696 |
+
elif output_path:
|
| 697 |
+
# Create an empty CSV file with the correct columns
|
| 698 |
+
empty_df = pd.DataFrame(columns=[
|
| 699 |
+
"platform", "date", "username", "post_text", "post_url",
|
| 700 |
+
"likes", "shares", "comments_count", "comment_text", "combined_text"
|
| 701 |
+
])
|
| 702 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 703 |
+
empty_df.to_csv(output_path, index=False)
|
| 704 |
+
print(f"[💾] Saved empty Lowyat Forum data file to {output_path}")
|
| 705 |
+
|
| 706 |
+
return df
|
| 707 |
+
|
| 708 |
+
# Test the crawler if run directly
|
| 709 |
+
if __name__ == "__main__":
|
| 710 |
+
test_keywords = ["cukai minyak sawit", "palm oil tax"]
|
| 711 |
+
test_sections = ["Kopitiam", "Finance"]
|
| 712 |
+
|
| 713 |
+
df = run(test_keywords, sections=test_sections, max_threads=10)
|
| 714 |
+
print(df.head())
|
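For reference, a minimal usage sketch of the crawler's run() entry point above (not part of the committed file; the import path and argument values are assumptions based on this commit's layout and defaults):

from ai_api.library import lowyat_crawler

# Crawl up to 5 threads for two Malay keywords, falling back to mock data when
# Cloudflare blocks the forum, and write the standardized CSV for later stages.
df = lowyat_crawler.run(
    ["cukai", "minyak sawit"],
    sections=["Kopitiam", "Finance"],
    max_threads=5,
    output_path="reports/lowyat_sample.csv",
    use_mock_data=True,
)
print(df[["platform", "username", "combined_text"]].head())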
ai_api/library/priority_indexer.py
ADDED
|
@@ -0,0 +1,360 @@
|
| 1 |
+
# priority_indexer.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
def load_agency_keywords(filepath=None):
|
| 9 |
+
"""
|
| 10 |
+
Load keywords for agency detection or use default keywords if file not found
|
| 11 |
+
"""
|
| 12 |
+
# Define default agency keywords if file not provided or not found
|
| 13 |
+
default_keywords = {
|
| 14 |
+
# Government-related keywords
|
| 15 |
+
"government": [
|
| 16 |
+
"kerajaan", "menteri", "perdana menteri", "kementerian", "jabatan",
|
| 17 |
+
"agensi", "dasar", "parlimen", "dewan rakyat", "dewan negara",
|
| 18 |
+
"dun", "pejabat", "keselamatan negara", "atm", "polis",
|
| 19 |
+
"kdn", "hasil", "sop", "ancaman", "pentadbiran", "kabinet",
|
| 20 |
+
"politik", "ahli parlimen", "wakil rakyat", "adun", "pemimpin",
|
| 21 |
+
"ketua menteri", "menteri besar", "exco", "majlis", "pihak berkuasa",
|
| 22 |
+
"pbt", "majlis perbandaran", "majlis bandaraya", "dewan bandaraya"
|
| 23 |
+
],
|
| 24 |
+
|
| 25 |
+
# Economic keywords
|
| 26 |
+
"economic": [
|
| 27 |
+
"ekonomi", "kewangan", "bank", "cukai", "subsidi", "harga", "kos",
|
| 28 |
+
"perbelanjaan", "pendapatan", "gaji", "dividen", "saham", "pasaran",
|
| 29 |
+
"inflasi", "deflasi", "krisis", "kemelesetan", "pertumbuhan", "gdp",
|
| 30 |
+
"kdnk", "pelaburan", "pelabur", "perniagaan", "syarikat", "industri",
|
| 31 |
+
"sektor", "perdagangan", "import", "eksport", "mata wang", "ringgit",
|
| 32 |
+
"dolar", "hutang", "pinjaman", "faedah", "untung", "rugi", "bayaran",
|
| 33 |
+
"fi", "yuran", "perbelanjaan", "pendapatan", "bonus", "elaun",
|
| 34 |
+
"insentif", "bantuan", "sumbangan", "derma", "zakat", "duti",
|
| 35 |
+
"levi", "caj", "jualan", "belian", "pembelian", "perolehan",
|
| 36 |
+
"tender", "kontrak", "projek", "pembangunan", "infrastruktur",
|
| 37 |
+
"pembinaan", "hartanah", "rumah", "kediaman", "komersial",
|
| 38 |
+
"tanah", "saiz", "keluasan", "murah", "mahal", "berpatutan",
|
| 39 |
+
"mampu", "tidak mampu", "bekalan", "stok", "inventori",
|
| 40 |
+
"simpanan", "rizab", "aset", "liabiliti", "kredit", "debit",
|
| 41 |
+
"ansuran", "keuntungan", "kerugian", "defisit", "surplus",
|
| 42 |
+
"lebihan", "kekurangan", "kenaikan", "penurunan", "peningkatan",
|
| 43 |
+
"pengurangan", "pemulihan", "pembaikan"
|
| 44 |
+
],
|
| 45 |
+
|
| 46 |
+
# Law-related keywords
|
| 47 |
+
"law": [
|
| 48 |
+
"undang-undang", "perundangan", "akta", "enakmen", "ordinan",
|
| 49 |
+
"peraturan", "perlembagaan", "mahkamah", "hakim", "peguam",
|
| 50 |
+
"pendakwa", "pendakwaan", "pertuduhan", "dakwaan", "saman",
|
| 51 |
+
"waran", "tangkap", "tahan", "reman", "jamin", "ikat jamin",
|
| 52 |
+
"denda", "hukuman", "penjara", "polis", "balai", "laporan",
|
| 53 |
+
"aduan", "siasatan", "siasat", "jenayah", "sivil", "kes",
|
| 54 |
+
"fail", "bicara", "perbicaraan", "prosiding", "rayuan",
|
| 55 |
+
"petisyen", "pindaan", "bon", "jaminan", "saksi", "keterangan",
|
| 56 |
+
"bukti", "forensik", "peguambela", "peguamcara", "pendakwa raya",
|
| 57 |
+
"majistret", "ketua hakim", "ketua hakim negara", "hakim besar",
|
| 58 |
+
"mahkamah tinggi", "mahkamah rayuan", "mahkamah persekutuan",
|
| 59 |
+
"mahkamah rendah", "mahkamah majistret", "mahkamah sesyen",
|
| 60 |
+
"mahkamah syariah", "pdrm", "ibu pejabat polis", "ketua polis",
|
| 61 |
+
"pegawai polis", "anggota polis", "konstabel", "koperal",
|
| 62 |
+
"sarjan", "inspektor", "superintendan", "komisioner", "sprm",
|
| 63 |
+
"suruhanjaya pencegahan rasuah", "rasuah", "korupsi",
|
| 64 |
+
"salah guna kuasa", "penyelewengan", "pecah amanah",
|
| 65 |
+
"pengubahan wang haram"
|
| 66 |
+
],
|
| 67 |
+
|
| 68 |
+
# Danger-related keywords
|
| 69 |
+
"danger": [
|
| 70 |
+
"bahaya", "merbahaya", "risiko", "ancaman", "bencana", "malapetaka",
|
| 71 |
+
"tragedi", "musibah", "kemalangan", "nahas", "kecelakaan", "kecederaan",
|
| 72 |
+
"kematian", "korban", "mangsa", "kemusnahan", "kerosakan", "kerugian",
|
| 73 |
+
"kehilangan", "kecurian", "rompakan", "samun", "ragut", "pecah",
|
| 74 |
+
"pecah rumah", "pecah masuk", "curi", "culik", "bunuh", "bunuh diri",
|
| 75 |
+
"mati", "cedera", "parah", "kritikal", "koma", "luka", "patah",
|
| 76 |
+
"retak", "lebam", "bengkak", "darah", "pendarahan", "kecemasan",
|
| 77 |
+
"ambulans", "hospital", "klinik", "doktor", "ubat", "dadah",
|
| 78 |
+
"narkotik", "ganja", "heroin", "kokain", "syabu", "pil kuda",
|
| 79 |
+
"ekstasi", "ketamin", "morfin", "ketagihan", "penagih", "pengedar",
|
| 80 |
+
"sindiket", "kartel", "mafia", "gangster", "kongsi gelap", "geng",
|
| 81 |
+
"kumpulan jenayah", "penjenayah", "penjahat", "pesalah", "banduan",
|
| 82 |
+
"tahanan", "suspek", "tertuduh", "terdakwa", "senjata", "pistol",
|
| 83 |
+
"revolver", "senapang", "rifle", "shotgun", "bom", "granat",
|
| 84 |
+
"peluru", "kelongsong", "senjata api", "senjata tajam", "pisau",
|
| 85 |
+
"parang", "kapak", "keris", "pedang", "racun", "toksin", "kimia",
|
| 86 |
+
"biologi", "nuklear", "radiasi", "sinaran", "letupan", "ledakan",
|
| 87 |
+
"kebakaran", "api", "nyalaan", "bara", "asap", "hangus", "terbakar",
|
| 88 |
+
"banjir", "bah", "limpahan", "hujan", "ribut", "taufan", "siklon",
|
| 89 |
+
"hurikan", "tornado", "puting beliung", "angin kencang", "kilat",
|
| 90 |
+
"petir", "guruh", "guntur", "halilintar", "tanah runtuh", "gelinciran tanah",
|
| 91 |
+
"runtuhan", "runtuh", "jatuh", "roboh", "rebah", "tumbang", "gempa",
|
| 92 |
+
"gempa bumi", "tsunami", "ombak besar", "gelombang tinggi", "kemarau",
|
| 93 |
+
"kekeringan", "perang", "pertempuran", "pergaduhan", "perkelahian",
|
| 94 |
+
"rusuhan", "kekacauan", "huru-hara", "keganasan", "kekerasan",
|
| 95 |
+
"keselamatan", "keselamatan negara", "keselamatan awam", "kanser",
|
| 96 |
+
"barah", "tumor", "penyakit", "wabak", "epidemik", "pandemik",
|
| 97 |
+
"jangkitan", "virus", "bakteria", "nyawa", "terancam", "maut"
|
| 98 |
+
]
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
# Try to load from file if provided
|
| 102 |
+
if filepath and os.path.exists(filepath):
|
| 103 |
+
try:
|
| 104 |
+
df = pd.read_csv(filepath)
|
| 105 |
+
if 'keyword' in df.columns and 'category' in df.columns:
|
| 106 |
+
# Group keywords by category
|
| 107 |
+
keywords = {}
|
| 108 |
+
for category in df['category'].unique():
|
| 109 |
+
keywords[category] = df[df['category'] == category]['keyword'].tolist()
|
| 110 |
+
return keywords
|
| 111 |
+
else:
|
| 112 |
+
print(f"[⚠️] Warning: Required columns not found in {filepath}. Using default keywords.")
|
| 113 |
+
return default_keywords
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"[⚠️] Error loading agency keywords from {filepath}: {e}")
|
| 116 |
+
return default_keywords
|
| 117 |
+
else:
|
| 118 |
+
if filepath:
|
| 119 |
+
print(f"[ℹ️] Agency keywords file not found. Using default keywords.")
|
| 120 |
+
return default_keywords
|
| 121 |
+
|
| 122 |
+
def analyze_text_content(df, keywords_dict):
|
| 123 |
+
"""
|
| 124 |
+
Analyze text content in the dataframe to find keywords
|
| 125 |
+
Returns a dictionary of found keywords by category
|
| 126 |
+
"""
|
| 127 |
+
found_keywords = {category: [] for category in keywords_dict.keys()}
|
| 128 |
+
|
| 129 |
+
# Combine all text columns
|
| 130 |
+
text_columns = ['post_text', 'comment_text', 'title', 'snippet', 'combined_text']
|
| 131 |
+
all_text = ""
|
| 132 |
+
|
| 133 |
+
for col in text_columns:
|
| 134 |
+
if col in df.columns:
|
| 135 |
+
all_text += " " + " ".join(df[col].fillna("").astype(str))
|
| 136 |
+
|
| 137 |
+
all_text = all_text.lower()
|
| 138 |
+
|
| 139 |
+
# Search for keywords in the combined text
|
| 140 |
+
for category, keywords in keywords_dict.items():
|
| 141 |
+
for keyword in keywords:
|
| 142 |
+
if keyword.lower() in all_text:
|
| 143 |
+
found_keywords[category].append(keyword)
|
| 144 |
+
|
| 145 |
+
# Remove duplicates and limit to top 5 per category
|
| 146 |
+
for category in found_keywords:
|
| 147 |
+
found_keywords[category] = list(set(found_keywords[category]))[:5]
|
| 148 |
+
|
| 149 |
+
return found_keywords
|
| 150 |
+
|
| 151 |
+
def calculate_priority_score(flags):
|
| 152 |
+
"""Calculate priority score based on flags"""
|
| 153 |
+
# Base weights for different flags
|
| 154 |
+
weights = {
|
| 155 |
+
"fact_check_value": 1.0,
|
| 156 |
+
"cause_confusion": 1.5,
|
| 157 |
+
"cause_chaos": 1.8,
|
| 158 |
+
"affects_government": 1.0,
|
| 159 |
+
"economic_impact": 0.8,
|
| 160 |
+
"law_related": 0.8,
|
| 161 |
+
"public_interest": 1.2,
|
| 162 |
+
"lives_in_danger": 1.5,
|
| 163 |
+
"viral": 1.0,
|
| 164 |
+
"urgent": 2.0
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
# Calculate weighted score
|
| 168 |
+
score = 0
|
| 169 |
+
for flag, value in flags.items():
|
| 170 |
+
if flag in weights and value == 1:
|
| 171 |
+
score += weights[flag]
|
| 172 |
+
|
| 173 |
+
# Normalize to 0-10 scale
|
| 174 |
+
max_possible_score = sum(weights.values())
|
| 175 |
+
normalized_score = (score / max_possible_score) * 10
|
| 176 |
+
|
| 177 |
+
# Cap at 10
|
| 178 |
+
return min(normalized_score, 10.0)
|
| 179 |
+
|
| 180 |
+
def get_priority_level(score):
|
| 181 |
+
"""Get priority level based on score"""
|
| 182 |
+
if score >= 8.0:
|
| 183 |
+
return "TINGGI"
|
| 184 |
+
elif score >= 5.0:
|
| 185 |
+
return "SEDERHANA"
|
| 186 |
+
else:
|
| 187 |
+
return "RENDAH"
|
| 188 |
+
|
| 189 |
+
def run(sentiment_csv, agencies_csv=None, output_path=None, claim=None, claim_id=None, keywords=None):
|
| 190 |
+
"""
|
| 191 |
+
Run priority indexing on sentiment data
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
sentiment_csv (str): Path to sentiment CSV file
|
| 195 |
+
agencies_csv (str, optional): Path to agencies CSV file
|
| 196 |
+
output_path (str, optional): Path to output JSON file
|
| 197 |
+
claim (str, optional): The claim text
|
| 198 |
+
claim_id (str, optional): Unique identifier for the claim
|
| 199 |
+
keywords (list, optional): List of keywords
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
dict: Priority report data
|
| 203 |
+
"""
|
| 204 |
+
print(f"[🔍] Loading sentiment data from: {sentiment_csv}")
|
| 205 |
+
|
| 206 |
+
try:
|
| 207 |
+
df = pd.read_csv(sentiment_csv)
|
| 208 |
+
except Exception as e:
|
| 209 |
+
print(f"[❌] Error reading sentiment data: {e}")
|
| 210 |
+
return None
|
| 211 |
+
|
| 212 |
+
# Load agency keywords
|
| 213 |
+
agency_keywords = load_agency_keywords(agencies_csv)
|
| 214 |
+
|
| 215 |
+
# Initialize flags
|
| 216 |
+
flags = {
|
| 217 |
+
"fact_check_value": 0,
|
| 218 |
+
"cause_confusion": 0,
|
| 219 |
+
"cause_chaos": 0,
|
| 220 |
+
"affects_government": 0,
|
| 221 |
+
"economic_impact": 0,
|
| 222 |
+
"law_related": 0,
|
| 223 |
+
"public_interest": 0,
|
| 224 |
+
"lives_in_danger": 0,
|
| 225 |
+
"viral": 0,
|
| 226 |
+
"urgent": 0
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
# Calculate sentiment counts
|
| 230 |
+
sentiment_counts = df['sentiment'].value_counts().to_dict()
|
| 231 |
+
|
| 232 |
+
# Convert numeric sentiments to text
|
| 233 |
+
sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
|
| 234 |
+
text_counts = {}
|
| 235 |
+
|
| 236 |
+
for k, v in sentiment_counts.items():
|
| 237 |
+
if k in sentiment_map:
|
| 238 |
+
text_counts[sentiment_map[k]] = v
|
| 239 |
+
else:
|
| 240 |
+
text_counts[k] = v
|
| 241 |
+
|
| 242 |
+
# Get total records
|
| 243 |
+
total_records = len(df)
|
| 244 |
+
|
| 245 |
+
# Calculate engagement metrics
|
| 246 |
+
total_likes = df['likes'].sum() if 'likes' in df.columns else 0
|
| 247 |
+
total_shares = df['shares'].sum() if 'shares' in df.columns else 0
|
| 248 |
+
total_comments = df['comments'].sum() if 'comments' in df.columns else (df['comments_count'].sum() if 'comments_count' in df.columns else 0)
|
| 249 |
+
total_views = df['views'].sum() if 'views' in df.columns else 0
|
| 250 |
+
|
| 251 |
+
total_engagement = total_likes + total_shares + total_comments + total_views
|
| 252 |
+
|
| 253 |
+
# Check fact_check_value flag (based on engagement)
|
| 254 |
+
# Rule: High engagement indicates need for fact checking
|
| 255 |
+
if total_engagement > 10000:
|
| 256 |
+
flags["fact_check_value"] = 1
|
| 257 |
+
print(f"[📊] Flag: fact_check_value triggered (Total engagement: {total_engagement})")
|
| 258 |
+
|
| 259 |
+
# Check sentiment-based flags
|
| 260 |
+
pos = text_counts.get("positive", 0)
|
| 261 |
+
neg = text_counts.get("negative", 0)
|
| 262 |
+
neu = text_counts.get("neutral", 0)
|
| 263 |
+
|
| 264 |
+
total_sentiment = pos + neg + neu
|
| 265 |
+
if total_sentiment > 0:
|
| 266 |
+
pos_ratio = pos / total_sentiment
|
| 267 |
+
neg_ratio = neg / total_sentiment
|
| 268 |
+
neu_ratio = neu / total_sentiment
|
| 269 |
+
|
| 270 |
+
# Rule: cause_confusion if positive = negative OR neutral is high
|
| 271 |
+
if (abs(pos_ratio - neg_ratio) < 0.2 and pos_ratio > 0.2 and neg_ratio > 0.2) or (neu_ratio > 0.7):
|
| 272 |
+
flags["cause_confusion"] = 1
|
| 273 |
+
print(f"[📊] Flag: cause_confusion triggered (Pos: {pos_ratio:.2f}, Neg: {neg_ratio:.2f}, Neu: {neu_ratio:.2f})")
|
| 274 |
+
|
| 275 |
+
# Rule: cause_chaos if negative sentiment is high
|
| 276 |
+
if neg_ratio > 0.4:
|
| 277 |
+
flags["cause_chaos"] = 1
|
| 278 |
+
print(f"[📊] Flag: cause_chaos triggered (Negative: {neg_ratio:.2f})")
|
| 279 |
+
|
| 280 |
+
# Analyze text content for keywords
|
| 281 |
+
found_keywords = analyze_text_content(df, agency_keywords)
|
| 282 |
+
|
| 283 |
+
# Check government-related flag
|
| 284 |
+
# Rule: Contains government-related keywords
|
| 285 |
+
if found_keywords["government"]:
|
| 286 |
+
flags["affects_government"] = 1
|
| 287 |
+
print(f"[📊] Flag: affects_government triggered (Gov terms: {', '.join(found_keywords['government'])})")
|
| 288 |
+
|
| 289 |
+
# Check economic impact flag
|
| 290 |
+
# Rule: Contains economic-related keywords
|
| 291 |
+
if found_keywords["economic"]:
|
| 292 |
+
flags["economic_impact"] = 1
|
| 293 |
+
print(f"[📊] Flag: economic_impact triggered (Economic terms: {', '.join(found_keywords['economic'])})")
|
| 294 |
+
|
| 295 |
+
# Check law-related flag
|
| 296 |
+
# Rule: Contains law-related keywords
|
| 297 |
+
if found_keywords["law"]:
|
| 298 |
+
flags["law_related"] = 1
|
| 299 |
+
print(f"[📊] Flag: law_related triggered (Law terms: {', '.join(found_keywords['law'])})")
|
| 300 |
+
|
| 301 |
+
# Check public interest flag
|
| 302 |
+
# Rule: High comments and shares indicate public interest
|
| 303 |
+
if (total_comments + total_shares) > 1000:
|
| 304 |
+
flags["public_interest"] = 1
|
| 305 |
+
print(f"[📊] Flag: public_interest triggered (Comments + Shares: {total_comments + total_shares})")
|
| 306 |
+
|
| 307 |
+
# Check danger-related flag
|
| 308 |
+
# Rule: Contains danger-related keywords
|
| 309 |
+
if found_keywords["danger"]:
|
| 310 |
+
flags["lives_in_danger"] = 1
|
| 311 |
+
print(f"[📊] Flag: lives_in_danger triggered (Danger terms: {', '.join(found_keywords['danger'])})")
|
| 312 |
+
|
| 313 |
+
# Check viral flag
|
| 314 |
+
# Rule: High shares indicate virality
|
| 315 |
+
if total_shares > 1000:
|
| 316 |
+
flags["viral"] = 1
|
| 317 |
+
print(f"[📊] Flag: viral triggered (Total shares: {total_shares})")
|
| 318 |
+
|
| 319 |
+
# Check urgent flag
|
| 320 |
+
# Rule: If 5 or more flags are triggered, it's urgent
|
| 321 |
+
flags_triggered = sum(flags.values())
|
| 322 |
+
if flags_triggered >= 5:
|
| 323 |
+
flags["urgent"] = 1
|
| 324 |
+
print(f"[📊] Flag: urgent triggered ({flags_triggered} flags triggered)")
|
| 325 |
+
|
| 326 |
+
# Calculate priority score
|
| 327 |
+
priority_score = calculate_priority_score(flags)
|
| 328 |
+
priority_level = get_priority_level(priority_score)
|
| 329 |
+
|
| 330 |
+
# Prepare report data
|
| 331 |
+
report_data = {
|
| 332 |
+
"priority_flags": flags,
|
| 333 |
+
"priority_score": priority_score,
|
| 334 |
+
"priority_level": priority_level,
|
| 335 |
+
"sentiment_counts": text_counts,
|
| 336 |
+
"total_records": total_records,
|
| 337 |
+
"engagement": {
|
| 338 |
+
"likes": int(total_likes),
|
| 339 |
+
"shares": int(total_shares),
|
| 340 |
+
"comments": int(total_comments),
|
| 341 |
+
"views": int(total_views),
|
| 342 |
+
"total": int(total_engagement)
|
| 343 |
+
},
|
| 344 |
+
"found_keywords": found_keywords,
|
| 345 |
+
"claim": claim,
|
| 346 |
+
"keywords": keywords,
|
| 347 |
+
"timestamp": datetime.now().isoformat()
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
# Ensure output directory exists
|
| 351 |
+
if not output_path:
|
| 352 |
+
output_path = os.path.join("reports", os.path.basename(sentiment_csv).replace("_sentiment.csv", "_priority.json"))
|
| 353 |
+
|
| 354 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 355 |
+
with open(output_path, 'w') as f:
|
| 356 |
+
json.dump(report_data, f, indent=4)
|
| 357 |
+
|
| 358 |
+
print(f"[📊] Priority index saved to {output_path}")
|
| 359 |
+
print(f"[📊] Priority score: {priority_score:.2f}/10 ({priority_level})")
|
| 360 |
+
return report_data
|
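A quick worked example of the flag weighting in calculate_priority_score() and get_priority_level() above: three triggered flags worth 1.0 + 1.8 + 1.0 = 3.8 out of a maximum 12.6 normalize to about 3.02 on the 0-10 scale, which maps to "RENDAH". A minimal sketch, assuming this commit's import path:

from ai_api.library import priority_indexer

flags = {"fact_check_value": 1, "cause_chaos": 1, "viral": 1}
score = priority_indexer.calculate_priority_score(flags)  # (1.0 + 1.8 + 1.0) / 12.6 * 10
print(round(score, 2), priority_indexer.get_priority_level(score))  # 3.02 RENDAH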
ai_api/library/sentiment_analyzer.py
ADDED
|
@@ -0,0 +1,91 @@
|
| 1 |
+
# sentiment_analyzer.py
|
| 2 |
+
# Simple sentiment analyzer that doesn't require PyTorch
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
+
import random
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
def simple_sentiment_analysis(text):
|
| 10 |
+
"""
|
| 11 |
+
A very simple rule-based sentiment analyzer for demonstration purposes.
|
| 12 |
+
Returns a sentiment label (neutral, positive, negative) and confidence score.
|
| 13 |
+
"""
|
| 14 |
+
if not text or len(text.strip()) < 15:
|
| 15 |
+
return "neutral", 0.5
|
| 16 |
+
|
| 17 |
+
# Convert to lowercase
|
| 18 |
+
text = text.lower()
|
| 19 |
+
|
| 20 |
+
# Define positive and negative word lists (Malay and English)
|
| 21 |
+
positive_words = [
|
| 22 |
+
"baik", "bagus", "hebat", "cantik", "indah", "suka", "gembira", "senang",
|
| 23 |
+
"setuju", "betul", "benar", "berkesan", "berjaya", "cemerlang", "positif",
|
| 24 |
+
"good", "great", "excellent", "amazing", "wonderful", "happy", "like", "love",
|
| 25 |
+
"agree", "correct", "true", "effective", "successful", "positive"
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
negative_words = [
|
| 29 |
+
"buruk", "teruk", "hodoh", "benci", "marah", "sedih", "kecewa", "susah",
|
| 30 |
+
"tidak setuju", "salah", "palsu", "gagal", "negatif", "masalah", "bahaya",
|
| 31 |
+
"bad", "terrible", "ugly", "hate", "angry", "sad", "disappointed", "difficult",
|
| 32 |
+
"disagree", "wrong", "false", "fail", "negative", "problem", "dangerous"
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Count positive and negative words
|
| 36 |
+
positive_count = sum(1 for word in positive_words if re.search(r'\b' + re.escape(word) + r'\b', text))
|
| 37 |
+
negative_count = sum(1 for word in negative_words if re.search(r'\b' + re.escape(word) + r'\b', text))
|
| 38 |
+
|
| 39 |
+
# Determine sentiment
|
| 40 |
+
if positive_count > negative_count:
|
| 41 |
+
sentiment = "positive"
|
| 42 |
+
confidence = 0.5 + min(0.5, (positive_count - negative_count) / 10)
|
| 43 |
+
elif negative_count > positive_count:
|
| 44 |
+
sentiment = "negative"
|
| 45 |
+
confidence = 0.5 + min(0.5, (negative_count - positive_count) / 10)
|
| 46 |
+
else:
|
| 47 |
+
sentiment = "neutral"
|
| 48 |
+
confidence = 0.5
|
| 49 |
+
|
| 50 |
+
return sentiment, round(confidence, 4)
|
| 51 |
+
|
| 52 |
+
def run(csv_path, sentiment_output_path=None):
|
| 53 |
+
"""
|
| 54 |
+
Runs sentiment analysis on combined comment + post text from the input CSV.
|
| 55 |
+
Saves the result (with sentiment + confidence columns) to a new CSV.
|
| 56 |
+
"""
|
| 57 |
+
print(f"[📄] Reading dataset: {csv_path}")
|
| 58 |
+
df = pd.read_csv(csv_path)
|
| 59 |
+
|
| 60 |
+
# Combine comment and post text into a single field
|
| 61 |
+
df['combined_text'] = df['comment_text'].fillna('') + ". " + df['post_text'].fillna('')
|
| 62 |
+
df['combined_text'] = df['combined_text'].str.strip()
|
| 63 |
+
|
| 64 |
+
sentiments = []
|
| 65 |
+
confidences = []
|
| 66 |
+
|
| 67 |
+
print("[🔍] Running simple sentiment classification...")
|
| 68 |
+
for text in df['combined_text']:
|
| 69 |
+
sentiment, confidence = simple_sentiment_analysis(text)
|
| 70 |
+
sentiments.append(sentiment)
|
| 71 |
+
confidences.append(confidence)
|
| 72 |
+
|
| 73 |
+
# Add results to DataFrame
|
| 74 |
+
df['sentiment'] = sentiments
|
| 75 |
+
df['confidence'] = confidences
|
| 76 |
+
|
| 77 |
+
# Map sentiments to numeric values for compatibility with the rest of the system
|
| 78 |
+
sentiment_map = {
|
| 79 |
+
"neutral": 0,
|
| 80 |
+
"positive": 1,
|
| 81 |
+
"negative": 2
|
| 82 |
+
}
|
| 83 |
+
df['sentiment_value'] = df['sentiment'].map(sentiment_map)
|
| 84 |
+
|
| 85 |
+
# Determine the output path dynamically if not provided
|
| 86 |
+
if not sentiment_output_path:
|
| 87 |
+
sentiment_output_path = csv_path.replace(".csv", "_sentiment.csv")
|
| 88 |
+
|
| 89 |
+
df.to_csv(sentiment_output_path, index=False)
|
| 90 |
+
print(f"[💾] Sentiment analysis completed. Output saved to: {sentiment_output_path}")
|
| 91 |
+
|
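A quick check of the rule-based scorer above: confidence starts at 0.5 and rises by 0.1 per net matched word, capped at 1.0. The sample sentence is illustrative; it hits three positive Malay words ("bagus", "berkesan", "setuju") and no negative ones, so it scores ("positive", 0.8). The import path is assumed from this commit's layout:

from ai_api.library.sentiment_analyzer import simple_sentiment_analysis

label, conf = simple_sentiment_analysis(
    "Keputusan ini bagus dan berkesan, saya setuju sepenuhnya"
)
print(label, conf)  # positive 0.8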
ai_api/library/simple_keyword_extraction.py
ADDED
|
@@ -0,0 +1,205 @@
|
| 1 |
+
# simple_keyword_extraction.py
|
| 2 |
+
# Simple keyword extraction for the claim analysis system
|
| 3 |
+
|
| 4 |
+
import re
|
| 5 |
+
from collections import Counter
|
| 6 |
+
|
| 7 |
+
# Define Malay stopwords
|
| 8 |
+
MALAY_STOPWORDS = [
|
| 9 |
+
"ada", "adalah", "adanya", "adapun", "agak", "agaknya", "agar", "akan", "akankah", "akhir",
|
| 10 |
+
"akhiri", "akhirnya", "aku", "akulah", "amat", "amatlah", "anda", "andalah", "antar", "antara",
|
| 11 |
+
"antaranya", "apa", "apaan", "apabila", "apakah", "apalagi", "apatah", "artinya", "asal", "asalkan",
|
| 12 |
+
"atas", "atau", "ataukah", "ataupun", "awal", "awalnya", "bagai", "bagaikan", "bagaimana", "bagaimanakah",
|
| 13 |
+
"bagaimanapun", "bagi", "bagian", "bahkan", "bahwa", "bahwasanya", "baik", "bakal", "bakalan", "balik",
|
| 14 |
+
"banyak", "bapak", "baru", "bawah", "beberapa", "begini", "beginian", "beginikah", "beginilah", "begitu",
|
| 15 |
+
"begitukah", "begitulah", "begitupun", "bekerja", "belakang", "belakangan", "belum", "belumlah", "benar",
|
| 16 |
+
"benarkah", "benarlah", "berada", "berakhir", "berakhirlah", "berakhirnya", "berapa", "berapakah", "berapalah",
|
| 17 |
+
"berapapun", "berarti", "berawal", "berbagai", "berdatangan", "beri", "berikan", "berikut", "berikutnya",
|
| 18 |
+
"berjumlah", "berkali-kali", "berkata", "berkehendak", "berkeinginan", "berkenaan", "berlainan", "berlalu",
|
| 19 |
+
"berlangsung", "berlebihan", "bermacam", "bermacam-macam", "bermaksud", "bermula", "bersama", "bersama-sama",
|
| 20 |
+
"bersiap", "bersiap-siap", "bertanya", "bertanya-tanya", "berturut", "berturut-turut", "bertutur", "berujar",
|
| 21 |
+
"berupa", "besar", "betul", "betulkah", "biasa", "biasanya", "bila", "bilakah", "bisa", "bisakah", "boleh",
|
| 22 |
+
"bolehkah", "bolehlah", "buat", "bukan", "bukankah", "bukanlah", "bukannya", "bulan", "bung", "cara", "caranya",
|
| 23 |
+
"cukup", "cukupkah", "cukuplah", "cuma", "dahulu", "dalam", "dan", "dapat", "dari", "daripada", "datang",
|
| 24 |
+
"dekat", "demi", "demikian", "demikianlah", "dengan", "depan", "di", "dia", "diakhiri", "diakhirinya", "dialah",
|
| 25 |
+
"diantara", "diantaranya", "diberi", "diberikan", "diberikannya", "dibuat", "dibuatnya", "didapat", "didatangkan",
|
| 26 |
+
"digunakan", "diibaratkan", "diibaratkannya", "diingat", "diingatkan", "diinginkan", "dijawab", "dijelaskan",
|
| 27 |
+
"dijelaskannya", "dikarenakan", "dikatakan", "dikatakannya", "dikerjakan", "diketahui", "diketahuinya", "dikira",
|
| 28 |
+
"dilakukan", "dilalui", "dilihat", "dimaksud", "dimaksudkan", "dimaksudkannya", "dimaksudnya", "diminta",
|
| 29 |
+
"dimintai", "dimisalkan", "dimulai", "dimulailah", "dimulainya", "dimungkinkan", "dini", "dipastikan",
|
| 30 |
+
"diperbuat", "diperbuatnya", "dipergunakan", "diperkirakan", "diperlihatkan", "diperlukan", "diperlukannya",
|
| 31 |
+
"dipersoalkan", "dipertanyakan", "dipunyai", "diri", "dirinya", "disampaikan", "disebut", "disebutkan",
|
| 32 |
+
"disebutkannya", "disini", "disinilah", "ditambahkan", "ditandaskan", "ditanya", "ditanyai", "ditanyakan",
|
| 33 |
+
"ditegaskan", "ditujukan", "ditunjuk", "ditunjuki", "ditunjukkan", "ditunjukkannya", "ditunjuknya", "dituturkan",
|
| 34 |
+
"dituturkannya", "diucapkan", "diucapkannya", "diungkapkan", "dong", "dua", "dulu", "empat", "enggak", "enggaknya",
|
| 35 |
+
"entah", "entahlah", "guna", "gunakan", "hal", "hampir", "hanya", "hanyalah", "hari", "harus", "haruslah",
|
| 36 |
+
"harusnya", "hendak", "hendaklah", "hendaknya", "hingga", "ia", "ialah", "ibarat", "ibaratkan", "ibaratnya",
|
| 37 |
+
"ibu", "ikut", "ingat", "ingat-ingat", "ingin", "inginkah", "inginkan", "ini", "inikah", "inilah", "itu",
|
| 38 |
+
"itukah", "itulah", "jadi", "jadilah", "jadinya", "jangan", "jangankan", "janganlah", "jauh", "jawab",
|
| 39 |
+
"jawaban", "jawabnya", "jelas", "jelaskan", "jelaslah", "jelasnya", "jika", "jikalau", "juga", "jumlah",
|
| 40 |
+
"jumlahnya", "justru", "kala", "kalau", "kalaulah", "kalaupun", "kalian", "kami", "kamilah", "kamu", "kamulah",
|
| 41 |
+
"kan", "kapan", "kapankah", "kapanpun", "karena", "karenanya", "kasus", "kata", "katakan", "katakanlah",
|
| 42 |
+
"katanya", "ke", "keadaan", "kebetulan", "kecil", "kedua", "keduanya", "keinginan", "kelamaan", "kelihatan",
|
| 43 |
+
"kelihatannya", "kelima", "keluar", "kembali", "kemudian", "kemungkinan", "kemungkinannya", "kenapa", "kepada",
|
| 44 |
+
"kepadanya", "kesamaan", "keseluruhan", "keseluruhannya", "keterlaluan", "ketika", "khususnya", "kini", "kinilah",
|
| 45 |
+
"kira", "kira-kira", "kiranya", "kita", "kitalah", "kok", "kurang", "lagi", "lagian", "lah", "lain", "lainnya",
|
| 46 |
+
"lalu", "lama", "lamanya", "lanjut", "lanjutnya", "lebih", "lewat", "lima", "luar", "macam", "maka", "makanya",
|
| 47 |
+
"makin", "malah", "malahan", "mampu", "mampukah", "mana", "manakala", "manalagi", "masa", "masalah", "masalahnya",
|
| 48 |
+
"masih", "masihkah", "masing", "masing-masing", "mau", "maupun", "melainkan", "melakukan", "melalui", "melihat",
|
| 49 |
+
"melihatnya", "memang", "memastikan", "memberi", "memberikan", "membuat", "memerlukan", "memihak", "meminta",
|
| 50 |
+
"memintakan", "memisalkan", "memperbuat", "mempergunakan", "memperkirakan", "memperlihatkan", "mempersiapkan",
|
| 51 |
+
"mempersoalkan", "mempertanyakan", "mempunyai", "memulai", "memungkinkan", "menaiki", "menambahkan", "menandaskan",
|
| 52 |
+
"menanti", "menanti-nanti", "menantikan", "menanya", "menanyai", "menanyakan", "mendapat", "mendapatkan",
|
| 53 |
+
"mendatang", "mendatangi", "mendatangkan", "menegaskan", "mengakhiri", "mengapa", "mengatakan", "mengatakannya",
|
| 54 |
+
"mengenai", "mengerjakan", "mengetahui", "menggunakan", "menghendaki", "mengibaratkan", "mengibaratkannya",
|
| 55 |
+
"mengingat", "mengingatkan", "menginginkan", "mengira", "mengucapkan", "mengucapkannya", "mengungkapkan",
|
| 56 |
+
"menjadi", "menjawab", "menjelaskan", "menuju", "menunjuk", "menunjuki", "menunjukkan", "menunjuknya", "menurut",
|
| 57 |
+
"menuturkan", "menyampaikan", "menyangkut", "menyatakan", "menyebutkan", "menyeluruh", "menyiapkan", "merasa",
|
| 58 |
+
"mereka", "merekalah", "merupakan", "meski", "meskipun", "meyakini", "meyakinkan", "minta", "mirip", "misal",
|
| 59 |
+
"misalkan", "misalnya", "mula", "mulai", "mulailah", "mulanya", "mungkin", "mungkinkah", "nah", "naik", "namun",
|
| 60 |
+
"nanti", "nantinya", "nyaris", "nyatanya", "oleh", "olehnya", "pada", "padahal", "padanya", "pak", "paling",
|
| 61 |
+
"panjang", "pantas", "para", "pasti", "pastilah", "penting", "pentingnya", "per", "percuma", "perlu", "perlukah",
|
| 62 |
+
"perlunya", "pernah", "persoalan", "pertama", "pertama-tama", "pertanyaan", "pertanyakan", "pihak", "pihaknya",
|
| 63 |
+
"pukul", "pula", "pun", "punya", "rasa", "rasanya", "rata", "rupanya", "saat", "saatnya", "saja", "sajalah",
|
| 64 |
+
"saling", "sama", "sama-sama", "sambil", "sampai", "sampai-sampai", "sampaikan", "sana", "sangat", "sangatlah",
|
| 65 |
+
"satu", "saya", "sayalah", "se", "sebab", "sebabnya", "sebagai", "sebagaimana", "sebagainya", "sebagian",
|
| 66 |
+
"sebaik", "sebaik-baiknya", "sebaiknya", "sebaliknya", "sebanyak", "sebegini", "sebegitu", "sebelum", "sebelumnya",
|
| 67 |
+
"sebenarnya", "seberapa", "sebesar", "sebetulnya", "sebisanya", "sebuah", "sebut", "sebutlah", "sebutnya",
|
| 68 |
+
"secara", "secukupnya", "sedang", "sedangkan", "sedemikian", "sedikit", "sedikitnya", "seenaknya", "segala",
|
| 69 |
+
"segalanya", "segera", "seharusnya", "sehingga", "seingat", "sejak", "sejauh", "sejenak", "sejumlah", "sekadar",
|
| 70 |
+
"sekadarnya", "sekali", "sekali-kali", "sekalian", "sekaligus", "sekalipun", "sekarang", "sekarang", "sekecil",
|
| 71 |
+
"seketika", "sekiranya", "sekitar", "sekitarnya", "sekurang-kurangnya", "sekurangnya", "sela", "selain", "selaku",
|
| 72 |
+
"selalu", "selama", "selama-lamanya", "selamanya", "selanjutnya", "seluruh", "seluruhnya", "semacam", "semakin",
|
| 73 |
+
"semampu", "semampunya", "semasa", "semasih", "semata", "semata-mata", "semaunya", "sementara", "semisal",
|
| 74 |
+
"semisalnya", "sempat", "semua", "semuanya", "semula", "sendiri", "sendirian", "sendirinya", "seolah",
|
| 75 |
+
"seolah-olah", "seorang", "sepanjang", "sepantasnya", "sepantasnyalah", "seperlunya", "seperti", "sepertinya",
|
| 76 |
+
"sepihak", "sering", "seringnya", "serta", "serupa", "sesaat", "sesama", "sesampai", "sesegera", "sesekali",
|
| 77 |
+
"seseorang", "sesuatu", "sesuatunya", "sesudah", "sesudahnya", "setelah", "setempat", "setengah", "seterusnya",
|
| 78 |
+
"setiap", "setiba", "setibanya", "setidak-tidaknya", "setidaknya", "setinggi", "seusai", "sewaktu", "siap",
|
| 79 |
+
"siapa", "siapakah", "siapapun", "sini", "sinilah", "soal", "soalnya", "suatu", "sudah", "sudahkah", "sudahlah",
|
| 80 |
+
"supaya", "tadi", "tadinya", "tahu", "tahun", "tak", "tambah", "tambahnya", "tampak", "tampaknya", "tandas",
|
| 81 |
+
"tandasnya", "tanpa", "tanya", "tanyakan", "tanyanya", "tapi", "tegas", "tegasnya", "telah", "tempat", "tengah",
|
| 82 |
+
"tentang", "tentu", "tentulah", "tentunya", "tepat", "terakhir", "terasa", "terbanyak", "terdahulu", "terdapat",
|
| 83 |
+
"terdiri", "terhadap", "terhadapnya", "teringat", "teringat-ingat", "terjadi", "terjadilah", "terjadinya",
|
| 84 |
+
"terkira", "terlalu", "terlebih", "terlihat", "termasuk", "ternyata", "tersampaikan", "tersebut", "tersebutlah",
|
| 85 |
+
"tertentu", "tertuju", "terus", "terutama", "tetap", "tetapi", "tiap", "tiba", "tiba-tiba", "tidak", "tidakkah",
|
| 86 |
+
"tidaklah", "tiga", "tinggi", "toh", "tunjuk", "turut", "tutur", "tuturnya", "ucap", "ucapnya", "ujar", "ujarnya",
|
| 87 |
+
"umum", "umumnya", "ungkap", "ungkapnya", "untuk", "usah", "usai", "waduh", "wah", "wahai", "waktu", "waktunya",
|
| 88 |
+
"walau", "walaupun", "wong", "yaitu", "yakin", "yakni", "yang", "ke", "pada", "ini", "itu", "juga", "dari", "dalam",
|
| 89 |
+
"akan", "jika", "maka", "karena", "oleh", "dengan", "atau", "secara", "untuk", "adalah", "sebagai", "bahwa", "hanya",
|
| 90 |
+
"namun", "tetapi", "ketika", "setelah", "sebelum", "selama", "sejak", "hingga", "sampai", "tentang", "seperti",
|
| 91 |
+
"terhadap", "melalui", "menurut", "berdasarkan", "mengenai", "antara", "di", "si", "sang", "para", "the", "of", "and",
|
| 92 |
+
"a", "to", "in", "that", "it", "with", "as", "for", "on", "was", "is", "by", "at", "this", "an", "are", "not", "from",
|
| 93 |
+
"but", "have", "had", "has", "be", "been", "were", "which", "or", "we", "their", "his", "her", "they", "its", "he",
|
| 94 |
+
"she", "you", "my", "all", "can", "would", "could", "should", "may", "might", "must", "shall", "will", "them", "there",
|
| 95 |
+
"these", "those", "some", "any", "no", "nor", "so", "such", "than", "then", "thus", "up", "down", "out", "about", "into",
|
| 96 |
+
"over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "what", "who",
|
| 97 |
+
"whom", "this", "that", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
|
| 98 |
+
"does", "did", "doing", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
|
| 99 |
+
"yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
|
| 100 |
+
"their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
|
| 101 |
+
"was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
|
| 102 |
+
"and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
|
| 103 |
+
"between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out",
|
| 104 |
+
"on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all",
|
| 105 |
+
"any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
|
| 106 |
+
"than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
|
| 107 |
+
]
|
| 108 |
+
|
| 109 |
+
def extract_keywords(text, top_n=10):
|
| 110 |
+
"""
|
| 111 |
+
Extract keywords from text using a simple frequency-based approach
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
text (str): Text to extract keywords from
|
| 115 |
+
top_n (int): Number of keywords to extract
|
| 116 |
+
|
| 117 |
+
Returns:
|
| 118 |
+
list: List of extracted keywords
|
| 119 |
+
"""
|
| 120 |
+
# Convert to lowercase
|
| 121 |
+
text = text.lower()
|
| 122 |
+
|
| 123 |
+
# Remove punctuation and split into words
|
| 124 |
+
words = re.findall(r'\b\w+\b', text)
|
| 125 |
+
|
| 126 |
+
# Remove stopwords
|
| 127 |
+
words = [word for word in words if word not in MALAY_STOPWORDS and len(word) > 2]
|
| 128 |
+
|
| 129 |
+
# Count word frequencies
|
| 130 |
+
word_counts = Counter(words)
|
| 131 |
+
|
| 132 |
+
# Get top N keywords
|
| 133 |
+
keywords = [word for word, count in word_counts.most_common(top_n)]
|
| 134 |
+
|
| 135 |
+
# If we have fewer than top_n keywords, return what we have
|
| 136 |
+
return keywords
|
| 137 |
+
|
| 138 |
+
def optimize_keywords_for_platforms(keywords):
|
| 139 |
+
"""
|
| 140 |
+
Optimize keywords for different platforms
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
keywords (list): List of keywords
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
dict: Dictionary with optimized keywords for each platform
|
| 147 |
+
"""
|
| 148 |
+
return {
|
| 149 |
+
"tiktok": keywords[:3],
|
| 150 |
+
"web_search": keywords[:5]
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
def detect_claim_type(text):
|
| 154 |
+
"""
|
| 155 |
+
Detect the type of claim based on keywords
|
| 156 |
+
|
| 157 |
+
Args:
|
| 158 |
+
text (str): The claim text
|
| 159 |
+
|
| 160 |
+
Returns:
|
| 161 |
+
str: The type of claim
|
| 162 |
+
"""
|
| 163 |
+
text = text.lower()
|
| 164 |
+
|
| 165 |
+
# Define keyword sets for different claim types
|
| 166 |
+
economic_keywords = ["ekonomi", "cukai", "harga", "kewangan", "bank", "ringgit", "subsidi", "kos", "bayaran", "hutang"]
|
| 167 |
+
political_keywords = ["kerajaan", "politik", "perdana menteri", "menteri", "parlimen", "pilihan raya", "parti", "kabinet"]
|
| 168 |
+
health_keywords = ["kesihatan", "penyakit", "hospital", "vaksin", "ubat", "doktor", "covid", "virus", "pandemik"]
|
| 169 |
+
social_keywords = ["sosial", "masyarakat", "pendidikan", "sekolah", "universiti", "pelajar", "guru", "agama"]
|
| 170 |
+
security_keywords = ["keselamatan", "polis", "tentera", "jenayah", "penjenayah", "senjata", "serangan"]
|
| 171 |
+
|
| 172 |
+
# Count matches for each category
|
| 173 |
+
economic_count = sum(1 for keyword in economic_keywords if keyword in text)
|
| 174 |
+
political_count = sum(1 for keyword in political_keywords if keyword in text)
|
| 175 |
+
health_count = sum(1 for keyword in health_keywords if keyword in text)
|
| 176 |
+
social_count = sum(1 for keyword in social_keywords if keyword in text)
|
| 177 |
+
security_count = sum(1 for keyword in security_keywords if keyword in text)
|
| 178 |
+
|
| 179 |
+
# Determine the dominant category
|
| 180 |
+
counts = {
|
| 181 |
+
"Ekonomi": economic_count,
|
| 182 |
+
"Politik": political_count,
|
| 183 |
+
"Kesihatan": health_count,
|
| 184 |
+
"Sosial": social_count,
|
| 185 |
+
"Keselamatan": security_count
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
# Get the category with the highest count
|
| 189 |
+
dominant_category = max(counts, key=counts.get)
|
| 190 |
+
|
| 191 |
+
# If no matches, return "Umum"
|
| 192 |
+
if counts[dominant_category] == 0:
|
| 193 |
+
return "Umum"
|
| 194 |
+
|
| 195 |
+
return dominant_category
|
| 196 |
+
|
| 197 |
+
if __name__ == "__main__":
|
| 198 |
+
# Test the function
|
| 199 |
+
test_text = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
|
| 200 |
+
keywords = extract_keywords(test_text)
|
| 201 |
+
print(f"Extracted keywords: {keywords}")
|
| 202 |
+
|
| 203 |
+
optimized = optimize_keywords_for_platforms(keywords)
|
| 204 |
+
print(f"Optimized for TikTok: {optimized['tiktok']}")
|
| 205 |
+
print(f"Optimized for web search: {optimized['web_search']}")
|
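The __main__ block above exercises extract_keywords and optimize_keywords_for_platforms but not detect_claim_type; a minimal sketch of calling it on the same sample claim (the expected output follows from the keyword counts above, not from a captured run; the import path mirrors how views.py loads this module):

from ai_api.library.simple_keyword_extraction import detect_claim_type

claim = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
print(detect_claim_type(claim))
# The lowercased claim matches "cukai" (Ekonomi) and "kerajaan" (Politik), a 1-1 tie;
# max() keeps the first key with the highest count, so this prints "Ekonomi".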
ai_api/library/websearch.py
ADDED
|
@@ -0,0 +1,237 @@
| 1 |
+
"""
|
| 2 |
+
websearch.py
|
| 3 |
+
Module for running web searches and saving results
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True, use_duckduckgo=True, full_claim=None):
|
| 11 |
+
"""
|
| 12 |
+
Run web search for keywords and save results to CSV
|
| 13 |
+
|
| 14 |
+
Args:
|
| 15 |
+
keywords (list): List of keywords to search for
|
| 16 |
+
output_path (str): Path to save results
|
| 17 |
+
num_results (int): Number of results per keyword
|
| 18 |
+
use_serpapi (bool): Whether to use SerpApi
|
| 19 |
+
use_serper (bool): Whether to use Serper.dev
|
| 20 |
+
use_duckduckgo (bool): Whether to use DuckDuckGo
|
| 21 |
+
full_claim (str): The full claim text to use as a search query
|
| 22 |
+
|
| 23 |
+
Returns:
|
| 24 |
+
int: Number of results saved
|
| 25 |
+
"""
|
| 26 |
+
# Import search functions
|
| 27 |
+
try:
|
| 28 |
+
from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
|
| 29 |
+
except ImportError:
|
| 30 |
+
print("Error importing web_search module. Make sure it exists and is accessible.")
|
| 31 |
+
return 0
|
| 32 |
+
|
| 33 |
+
# Create search queries
|
| 34 |
+
all_results = []
|
| 35 |
+
|
| 36 |
+
# Always use the full claim directly if available
|
| 37 |
+
if full_claim:
|
| 38 |
+
print(f"Using full claim as direct search query: '{full_claim}'")
|
| 39 |
+
|
| 40 |
+
# Search using SerpApi with the exact claim
|
| 41 |
+
if use_serpapi:
|
| 42 |
+
print("Searching with SerpApi (exact claim)...")
|
| 43 |
+
serpapi_results = search_serpapi(full_claim, num_results=num_results)
|
| 44 |
+
if serpapi_results:
|
| 45 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (exact claim)")
|
| 46 |
+
all_results.extend(serpapi_results)
|
| 47 |
+
else:
|
| 48 |
+
print("No results from SerpApi (exact claim)")
|
| 49 |
+
|
| 50 |
+
# Search using Serper.dev with the exact claim
|
| 51 |
+
if use_serper:
|
| 52 |
+
print("Searching with Serper.dev (exact claim)...")
|
| 53 |
+
serper_results = search_serper(full_claim, num_results=num_results)
|
| 54 |
+
if serper_results:
|
| 55 |
+
print(f"Found {len(serper_results)} results from Serper.dev (exact claim)")
|
| 56 |
+
all_results.extend(serper_results)
|
| 57 |
+
else:
|
| 58 |
+
print("No results from Serper.dev (exact claim)")
|
| 59 |
+
|
| 60 |
+
# For crime-related claims, also try targeted queries
|
| 61 |
+
crime_related = any(term in full_claim.lower() for term in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"])
|
| 62 |
+
kelantan_related = "kelantan" in full_claim.lower()
|
| 63 |
+
|
| 64 |
+
if crime_related and kelantan_related:
|
| 65 |
+
# Check if this is about sexual crimes or ammunition
|
| 66 |
+
ammunition_related = any(term in full_claim.lower() for term in ["kelongsong", "peluru", "senjata", "tan"])
|
| 67 |
+
|
| 68 |
+
if ammunition_related:
|
| 69 |
+
targeted_queries = [
|
| 70 |
+
"50 tan kelongsong peluru ditemui",
|
| 71 |
+
"kilang haram proses kelongsong peluru",
|
| 72 |
+
"penemuan kelongsong peluru di kilang",
|
| 73 |
+
"kelongsong peluru musuh negara"
|
| 74 |
+
]
|
| 75 |
+
else:
|
| 76 |
+
# Default to sexual crime queries
|
| 77 |
+
targeted_queries = [
|
| 78 |
+
"statistik jenayah seksual di kelantan",
|
| 79 |
+
"kes rogol dan sumbang mahram di kelantan meningkat",
|
| 80 |
+
"pdrm kelantan lapor kes rogol"
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
for query in targeted_queries:
|
| 84 |
+
print(f"Using targeted query: '{query}'")
|
| 85 |
+
|
| 86 |
+
# Search using SerpApi
|
| 87 |
+
if use_serpapi:
|
| 88 |
+
print(f"Searching with SerpApi (targeted query: {query})...")
|
| 89 |
+
serpapi_results = search_serpapi(query, num_results=num_results//2) # Use fewer results for each targeted query
|
| 90 |
+
if serpapi_results:
|
| 91 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (targeted query)")
|
| 92 |
+
all_results.extend(serpapi_results)
|
| 93 |
+
else:
|
| 94 |
+
print(f"No results from SerpApi (targeted query: {query})")
|
| 95 |
+
|
| 96 |
+
# Search using Serper.dev
|
| 97 |
+
if use_serper:
|
| 98 |
+
print(f"Searching with Serper.dev (targeted query: {query})...")
|
| 99 |
+
serper_results = search_serper(query, num_results=num_results//2) # Use fewer results for each targeted query
|
| 100 |
+
if serper_results:
|
| 101 |
+
print(f"Found {len(serper_results)} results from Serper.dev (targeted query)")
|
| 102 |
+
all_results.extend(serper_results)
|
| 103 |
+
else:
|
| 104 |
+
print(f"No results from Serper.dev (targeted query: {query})")
|
| 105 |
+
else:
|
| 106 |
+
# For other claims, use the original approach with keywords
|
| 107 |
+
# 1. Full claim query (if available)
|
| 108 |
+
full_claim_query = f'"{full_claim}"' if full_claim else None
|
| 109 |
+
|
| 110 |
+
# 2. Keyword-based query
|
| 111 |
+
search_terms = []
|
| 112 |
+
for kw in keywords:
|
| 113 |
+
# If keyword contains spaces (multi-word phrase), wrap in quotes
|
| 114 |
+
if " " in kw:
|
| 115 |
+
search_terms.append(f'"{kw}"')
|
| 116 |
+
else:
|
| 117 |
+
# For single words, don't use quotes to get broader results
|
| 118 |
+
search_terms.append(kw)
|
| 119 |
+
|
| 120 |
+
keyword_query = " OR ".join(search_terms)
|
| 121 |
+
|
| 122 |
+
# Search using full claim first (if available)
|
| 123 |
+
if full_claim_query:
|
| 124 |
+
print(f"Searching with full claim: {full_claim_query}")
|
| 125 |
+
|
| 126 |
+
# Search using SerpApi
|
| 127 |
+
if use_serpapi:
|
| 128 |
+
print("Searching with SerpApi (full claim)...")
|
| 129 |
+
serpapi_results = search_serpapi(full_claim, num_results=num_results)
|
| 130 |
+
if serpapi_results:
|
| 131 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (full claim)")
|
| 132 |
+
all_results.extend(serpapi_results)
|
| 133 |
+
else:
|
| 134 |
+
print("No results from SerpApi (full claim)")
|
| 135 |
+
|
| 136 |
+
# Search using Serper.dev
|
| 137 |
+
if use_serper:
|
| 138 |
+
print("Searching with Serper.dev (full claim)...")
|
| 139 |
+
serper_results = search_serper(full_claim, num_results=num_results)
|
| 140 |
+
if serper_results:
|
| 141 |
+
print(f"Found {len(serper_results)} results from Serper.dev (full claim)")
|
| 142 |
+
all_results.extend(serper_results)
|
| 143 |
+
else:
|
| 144 |
+
print("No results from Serper.dev (full claim)")
|
| 145 |
+
|
| 146 |
+
# Search using keyword query as fallback
|
| 147 |
+
if not all_results or len(all_results) < num_results:
|
| 148 |
+
print(f"Searching with keyword query: {keyword_query}")
|
| 149 |
+
|
| 150 |
+
# Search using SerpApi
|
| 151 |
+
if use_serpapi:
|
| 152 |
+
print("Searching with SerpApi (keywords)...")
|
| 153 |
+
serpapi_results = search_serpapi(keyword_query, num_results=num_results)
|
| 154 |
+
if serpapi_results:
|
| 155 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (keywords)")
|
| 156 |
+
all_results.extend(serpapi_results)
|
| 157 |
+
else:
|
| 158 |
+
print("No results from SerpApi (keywords)")
|
| 159 |
+
|
| 160 |
+
# Search using Serper.dev
|
| 161 |
+
if use_serper:
|
| 162 |
+
print("Searching with Serper.dev (keywords)...")
|
| 163 |
+
serper_results = search_serper(keyword_query, num_results=num_results)
|
| 164 |
+
if serper_results:
|
| 165 |
+
print(f"Found {len(serper_results)} results from Serper.dev (keywords)")
|
| 166 |
+
all_results.extend(serper_results)
|
| 167 |
+
else:
|
| 168 |
+
print("No results from Serper.dev (keywords)")
|
| 169 |
+
|
| 170 |
+
# Add DuckDuckGo results
|
| 171 |
+
if use_duckduckgo:
|
| 172 |
+
query_to_use = full_claim if full_claim else keyword_query
|
| 173 |
+
print(f"Searching with DuckDuckGo using: {query_to_use}")
|
| 174 |
+
duckduckgo_results = search_duckduckgo(query_to_use, num_results=num_results)
|
| 175 |
+
if duckduckgo_results:
|
| 176 |
+
print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
|
| 177 |
+
all_results.extend(duckduckgo_results)
|
| 178 |
+
else:
|
| 179 |
+
print("No results from DuckDuckGo")
|
| 180 |
+
|
| 181 |
+
# Add Google Trends data
|
| 182 |
+
trends_data = get_google_trends(keywords)
|
| 183 |
+
|
| 184 |
+
# Convert to DataFrame
|
| 185 |
+
if all_results:
|
| 186 |
+
# Remove duplicates based on URL
|
| 187 |
+
unique_results = []
|
| 188 |
+
seen_urls = set()
|
| 189 |
+
|
| 190 |
+
for result in all_results:
|
| 191 |
+
url = result.get('link', '')
|
| 192 |
+
if url and url not in seen_urls:
|
| 193 |
+
seen_urls.add(url)
|
| 194 |
+
unique_results.append(result)
|
| 195 |
+
|
| 196 |
+
print(f"Removed {len(all_results) - len(unique_results)} duplicate results")
|
| 197 |
+
|
| 198 |
+
df = pd.DataFrame(unique_results)
|
| 199 |
+
|
| 200 |
+
# Add additional columns to match the format expected by the sentiment analyzer
|
| 201 |
+
df['platform'] = 'web'
|
| 202 |
+
df['username'] = df['source']
|
| 203 |
+
df['post_text'] = df['snippet']
|
| 204 |
+
df['post_url'] = df['link']
|
| 205 |
+
df['likes'] = 0
|
| 206 |
+
df['shares'] = 0
|
| 207 |
+
df['comments_count'] = 0
|
| 208 |
+
df['comment_text'] = ''
|
| 209 |
+
df['combined_text'] = df['title'] + ' ' + df['snippet']
|
| 210 |
+
df['date'] = datetime.now().strftime('%Y-%m-%d')
|
| 211 |
+
|
| 212 |
+
# Create output directory if it doesn't exist
|
| 213 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 214 |
+
|
| 215 |
+
# Save to CSV
|
| 216 |
+
df.to_csv(output_path, index=False)
|
| 217 |
+
print(f"Saved {len(df)} web search results to {output_path}")
|
| 218 |
+
return len(df)
|
| 219 |
+
else:
|
| 220 |
+
print("No web search results found")
|
| 221 |
+
return 0
|
| 222 |
+
|
| 223 |
+
# Test the module
|
| 224 |
+
if __name__ == "__main__":
|
| 225 |
+
import sys
|
| 226 |
+
|
| 227 |
+
# Get keywords from command line or use default
|
| 228 |
+
if len(sys.argv) > 1:
|
| 229 |
+
keywords = sys.argv[1:]
|
| 230 |
+
full_claim = " ".join(sys.argv[1:])
|
| 231 |
+
else:
|
| 232 |
+
keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
|
| 233 |
+
full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"
|
| 234 |
+
|
| 235 |
+
# Run web search
|
| 236 |
+
output_path = "output/web_search_results.csv"
|
| 237 |
+
run(keywords, output_path, num_results=10, full_claim=full_claim)
|
ai_api/middleware.py
ADDED
|
@@ -0,0 +1,40 @@
# middleware.py
import hashlib
import hmac
from django.http import JsonResponse
from ai_api.models import APIClient

class HMACAuthMiddleware:
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # if request.path.startswith('/admin/'):
        #     return self.get_response(request)
        if not request.path.startswith('/api/'):
            return self.get_response(request)

        client_id = request.headers.get('X-Client-ID')
        signature = request.headers.get('X-Signature')

        if not client_id or not signature:
            return JsonResponse({'error': 'Missing credentials'}, status=401)

        try:
            client = APIClient.objects.get(client_id=client_id)
        except APIClient.DoesNotExist:
            return JsonResponse({'error': 'Invalid client ID'}, status=401)

        # Signature is an HMAC-SHA256 of the raw request body keyed with the client's secret.
        expected_signature = hmac.new(
            client.secret_key.encode(),
            request.body,
            hashlib.sha256
        ).hexdigest()

        if not hmac.compare_digest(expected_signature, signature):
            return JsonResponse({'error': 'Invalid signature'}, status=401)

        request.api_client = client
        return self.get_response(request)
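A minimal client-side sketch of the signing scheme this middleware expects: the raw request body is signed with HMAC-SHA256 using the client's secret key and sent alongside the client ID. The endpoint URL and credentials are placeholders, and the requests library is assumed to be available on the client.

import hashlib
import hmac
import json

import requests  # assumed client-side dependency

CLIENT_ID = "replace-with-client-id"    # placeholder
SECRET_KEY = "replace-with-secret-key"  # placeholder

body = json.dumps({"claim": "contoh dakwaan"}).encode()
signature = hmac.new(SECRET_KEY.encode(), body, hashlib.sha256).hexdigest()

response = requests.post(
    "http://localhost:8000/api/classification/",  # placeholder path under /api/
    data=body,
    headers={
        "Content-Type": "application/json",
        "X-Client-ID": CLIENT_ID,
        "X-Signature": signature,
    },
)
print(response.status_code)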
ai_api/migrations/0001_initial.py
ADDED
|
@@ -0,0 +1,24 @@
# Generated by Django 4.2.20 on 2025-05-08 00:50

from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='APIClient',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, unique=True)),
                ('client_id', models.CharField(editable=False, max_length=32, unique=True)),
                ('secret_key', models.CharField(editable=False, max_length=64)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
ai_api/migrations/__init__.py
ADDED
|
File without changes
|
ai_api/models.py
ADDED
|
@@ -0,0 +1,18 @@
from django.db import models
import secrets

class APIClient(models.Model):
    name = models.CharField(max_length=100, unique=True)
    client_id = models.CharField(max_length=32, unique=True, editable=False)
    secret_key = models.CharField(max_length=64, editable=False)
    created_at = models.DateTimeField(auto_now_add=True)

    def save(self, *args, **kwargs):
        if not self.client_id:
            self.client_id = secrets.token_hex(16)
        if not self.secret_key:
            self.secret_key = secrets.token_hex(32)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name
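Because save() fills in client_id and secret_key automatically, issuing credentials is just a matter of creating a row; a minimal sketch, e.g. from python manage.py shell (the client name is a placeholder):

from ai_api.models import APIClient

client = APIClient.objects.create(name="demo-client")
print(client.client_id)   # 32 hex characters from secrets.token_hex(16)
print(client.secret_key)  # 64 hex characters from secrets.token_hex(32)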
ai_api/request_serializer.py
ADDED
|
@@ -0,0 +1,30 @@
from rest_framework import serializers

class TranscriptionRequestSerializer(serializers.Serializer):
    url = serializers.URLField(required=False, allow_null=True)
    media = serializers.FileField(required=False, allow_null=True)

    def validate(self, attrs):
        url = attrs.get('url')
        media = attrs.get('media')

        if not url and not media:
            raise serializers.ValidationError("Either 'url' or 'media' must be provided.")

        return attrs

    def validate_media(self, file):
        if file is None:
            return file

        allowed_types = ['audio/', 'video/']
        content_type = getattr(file, 'content_type', '')

        if not any(content_type.startswith(t) for t in allowed_types):
            raise serializers.ValidationError("Only audio or video files are allowed.")

        return file

class ClassificationRequestSerializer(serializers.Serializer):
    claim = serializers.CharField()
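A minimal sketch of how these serializers behave when validating request payloads; the example data is illustrative only:

from ai_api.request_serializer import (
    ClassificationRequestSerializer,
    TranscriptionRequestSerializer,
)

s = TranscriptionRequestSerializer(data={})  # neither url nor media supplied
print(s.is_valid(), s.errors)                # False, with the "Either 'url' or 'media'" error

s = TranscriptionRequestSerializer(data={"url": "https://example.com/audio.mp3"})
print(s.is_valid())                          # True

s = ClassificationRequestSerializer(data={"claim": "contoh dakwaan"})
print(s.is_valid(), s.validated_data)        # True {'claim': 'contoh dakwaan'}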
ai_api/templates/base-copy.html
ADDED
|
@@ -0,0 +1,35 @@
| 1 |
+
<!-- templates/base.html -->
|
| 2 |
+
<!DOCTYPE html>
|
| 3 |
+
<html lang="en">
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>{% block title %}My Django Project{% endblock %}</title>
|
| 8 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 9 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 10 |
+
</head>
|
| 11 |
+
<body>
|
| 12 |
+
<!-- Navbar (optional) -->
|
| 13 |
+
<nav class="navbar navbar-expand-lg navbar-light bg-light ps-2">
|
| 14 |
+
<a class="navbar-brand" href="/">Home</a>
|
| 15 |
+
</nav>
|
| 16 |
+
|
| 17 |
+
<!-- Main content area -->
|
| 18 |
+
<div class="container m-2">
|
| 19 |
+
{% block content %}{% endblock %}
|
| 20 |
+
</div>
|
| 21 |
+
|
| 22 |
+
<!-- Footer (optional) -->
|
| 23 |
+
<footer class="bg-light text-center py-3">
|
| 24 |
+
<p>© 2025 BERNAMA Fact Check</p>
|
| 25 |
+
</footer>
|
| 26 |
+
|
| 27 |
+
<!-- jQuery Library -->
|
| 28 |
+
<script src="https://code.jquery.com/jquery-3.6.4.min.js"
|
| 29 |
+
integrity="sha256-oP6HI9z1XaZNBrJURtCoUT5SUnxFr8s3BzRl+cbzUq8="
|
| 30 |
+
crossorigin="anonymous"></script>
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
{% block scripts %}{% endblock %}
|
| 34 |
+
</body>
|
| 35 |
+
</html>
|
ai_api/templates/base.html
ADDED
|
@@ -0,0 +1,61 @@
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>{% block title %}BERNAMA Fact Check{% endblock %}</title>
|
| 7 |
+
{% load static %}
|
| 8 |
+
|
| 9 |
+
<!-- Bootstrap CSS -->
|
| 10 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 11 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha512-...hash..." crossorigin="anonymous" referrerpolicy="no-referrer" />
|
| 12 |
+
<link rel="stylesheet" href="{% static 'js/DataTables/datatables.min.css' %}">
|
| 13 |
+
|
| 14 |
+
<link rel="icon" href="{% static 'favicon.ico' %}" type="image/x-icon">
|
| 15 |
+
<!-- Optional: Custom dark mode toggle -->
|
| 16 |
+
<style>
|
| 17 |
+
body.dark-mode {
|
| 18 |
+
background-color: #121212;
|
| 19 |
+
color: #f8f9fa;
|
| 20 |
+
}
|
| 21 |
+
body.dark-mode .bg-light {
|
| 22 |
+
background-color: #1f1f1f !important;
|
| 23 |
+
}
|
| 24 |
+
body.dark-mode .text-muted {
|
| 25 |
+
color: #adb5bd !important;
|
| 26 |
+
}
|
| 27 |
+
</style>
|
| 28 |
+
</head>
|
| 29 |
+
<body class="dark-mode">
|
| 30 |
+
|
| 31 |
+
<!-- Hero Section -->
|
| 32 |
+
<section class="py-5 bg-light text-center shadow">
|
| 33 |
+
<div class="container">
|
| 34 |
+
<h1 class="display-5 fw-bold mb-3">AI Feature Testing Bed</h1>
|
| 35 |
+
<p class="lead text-muted mb-4">Experiment with cutting-edge AI modules like Face Recognition and Speech Transcription in one place.</p>
|
| 36 |
+
<a href="/#features" class="btn btn-primary btn-lg">Explore Features</a>
|
| 37 |
+
</div>
|
| 38 |
+
</section>
|
| 39 |
+
|
| 40 |
+
<!-- Main Section -->
|
| 41 |
+
<section class="py-5">
|
| 42 |
+
<div class="container">
|
| 43 |
+
{% block content %}{% endblock %}
|
| 44 |
+
</div>
|
| 45 |
+
</section>
|
| 46 |
+
|
| 47 |
+
<!-- Footer -->
|
| 48 |
+
<footer class="text-center py-4 text-muted">
|
| 49 |
+
© 2025 BERNAMA Fact Check. All rights reserved.
|
| 50 |
+
</footer>
|
| 51 |
+
|
| 52 |
+
<!-- Bootstrap JS Bundle (with Popper) -->
|
| 53 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 54 |
+
|
| 55 |
+
<!-- jQuery -->
|
| 56 |
+
<script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
|
| 57 |
+
<script src="{% static 'js/DataTables/datatables.min.js' %}"></script>
|
| 58 |
+
|
| 59 |
+
{% block scripts %}{% endblock %}
|
| 60 |
+
</body>
|
| 61 |
+
</html>
|
ai_api/templates/classification.html
ADDED
|
@@ -0,0 +1,142 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
|
| 3 |
+
{% block content %}
|
| 4 |
+
<div class="container py-4">
|
| 5 |
+
<h2 class="mb-4 fw-bold text-white">Classification</h2>
|
| 6 |
+
|
| 7 |
+
<form id="classificationForm" method="POST">
|
| 8 |
+
{% csrf_token %}
|
| 9 |
+
{{ form.as_p }}
|
| 10 |
+
<button type="submit" class="btn btn-primary mt-3">
|
| 11 |
+
Submit
|
| 12 |
+
</button>
|
| 13 |
+
</form>
|
| 14 |
+
|
| 15 |
+
<!-- Progress Bar -->
|
| 16 |
+
<div id="progressContainer" class="mt-4" style="display: none;">
|
| 17 |
+
<div class="progress">
|
| 18 |
+
<div id="progressBar" class="progress-bar progress-bar-striped progress-bar-animated" role="progressbar" style="width: 0%"></div>
|
| 19 |
+
</div>
|
| 20 |
+
<p id="progressText" class="text-white mt-2"></p>
|
| 21 |
+
</div>
|
| 22 |
+
|
| 23 |
+
<!-- Results Container -->
|
| 24 |
+
<div id="resultsContainer" style="display: none;">
|
| 25 |
+
<div class="alert alert-secondary text-uppercase small mt-4">
|
| 26 |
+
<p><strong>Category:</strong> <span id="category"></span></p>
|
| 27 |
+
<p><strong>Keywords:</strong> <span id="keywords"></span></p>
|
| 28 |
+
<p><strong>Priority Index:</strong> <span id="priorityScore"></span>/10</p>
|
| 29 |
+
</div>
|
| 30 |
+
|
| 31 |
+
<div class="row g-4 mt-2" id="priorityCards">
|
| 32 |
+
<!-- Cards will be dynamically inserted here -->
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
<div class="row mt-2 table-responsive" id="sentimentTable">
|
| 36 |
+
<!-- Sentiment table will be dynamically inserted here -->
|
| 37 |
+
</div>
|
| 38 |
+
</div>
|
| 39 |
+
</div>
|
| 40 |
+
{% endblock %}
|
| 41 |
+
|
| 42 |
+
{% block scripts %}
|
| 43 |
+
<script>
|
| 44 |
+
$(document).ready(function(){
|
| 45 |
+
let progressInterval;
|
| 46 |
+
|
| 47 |
+
$('#classificationForm').on('submit', function(e) {
|
| 48 |
+
e.preventDefault();
|
| 49 |
+
|
| 50 |
+
// Reset and show progress
|
| 51 |
+
$('#progressContainer').show();
|
| 52 |
+
$('#resultsContainer').hide();
|
| 53 |
+
$('#progressBar').css('width', '0%');
|
| 54 |
+
$('#progressText').text('Starting...');
|
| 55 |
+
|
| 56 |
+
// Clear any existing interval
|
| 57 |
+
if (progressInterval) {
|
| 58 |
+
clearInterval(progressInterval);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
// Get form data
|
| 62 |
+
const formData = new FormData(this);
|
| 63 |
+
const progressKey = Date.now().toString();
|
| 64 |
+
formData.append('progress_key', progressKey);
|
| 65 |
+
|
| 66 |
+
// Start progress checking
|
| 67 |
+
progressInterval = setInterval(() => {
|
| 68 |
+
$.get(`/progress/${progressKey}/`, function(data) {
|
| 69 |
+
$('#progressBar').css('width', `${data.percent}%`);
|
| 70 |
+
$('#progressText').text(`${data.stage}...`);
|
| 71 |
+
|
| 72 |
+
if (data.stage === 'complete') {
|
| 73 |
+
clearInterval(progressInterval);
|
| 74 |
+
}
|
| 75 |
+
});
|
| 76 |
+
}, 1000);
|
| 77 |
+
|
| 78 |
+
// Submit form via AJAX
|
| 79 |
+
$.ajax({
|
| 80 |
+
url: window.location.pathname,
|
| 81 |
+
type: 'POST',
|
| 82 |
+
data: formData,
|
| 83 |
+
processData: false,
|
| 84 |
+
contentType: false,
|
| 85 |
+
success: function(response) {
|
| 86 |
+
clearInterval(progressInterval);
|
| 87 |
+
$('#progressContainer').hide();
|
| 88 |
+
$('#resultsContainer').show();
|
| 89 |
+
|
| 90 |
+
// Update results
|
| 91 |
+
$('#category').text(response.classification);
|
| 92 |
+
$('#keywords').text(response.keywords.join(', '));
|
| 93 |
+
$('#priorityScore').text(response.priority_data.priority_score.toFixed(1));
|
| 94 |
+
|
| 95 |
+
// Update priority cards
|
| 96 |
+
const priorityFlags = response.priority_data.priority_flags;
|
| 97 |
+
const cardData = [
|
| 98 |
+
{ title: 'Does it have fact-check news value?', flag: 'fact_check_value', bg: 'bg-primary' },
|
| 99 |
+
{ title: 'Could it cause confusion?', flag: 'cause_confusion', bg: 'bg-secondary' },
|
| 100 |
+
{ title: 'Could it cause chaos?', flag: 'cause_chaos', bg: 'bg-success' },
|
| 101 |
+
{ title: 'Does it affect government?', flag: 'affects_government', bg: 'bg-danger' },
|
| 102 |
+
{ title: 'Immediate economic impact?', flag: 'economic_impact', bg: 'bg-warning' },
|
| 103 |
+
{ title: 'Have laws been broken/bent?', flag: 'law_related', bg: 'bg-info' },
|
| 104 |
+
{ title: 'Is it in the public interest?', flag: 'public_interest', bg: 'bg-light' },
|
| 105 |
+
{ title: 'Are lives in danger?', flag: 'lives_in_danger', bg: 'bg-dark' },
|
| 106 |
+
{ title: 'Is it already viral?', flag: 'viral', bg: 'bg-warning' },
|
| 107 |
+
{ title: 'Is it urgent or time sensitive?', flag: 'urgent', bg: 'bg-success' }
|
| 108 |
+
];
|
| 109 |
+
|
| 110 |
+
let cardsHtml = '';
|
| 111 |
+
cardData.forEach(card => {
|
| 112 |
+
cardsHtml += `
|
| 113 |
+
<div class="col-12 col-sm-6 col-md-4 col-lg-3">
|
| 114 |
+
<div class="card text-white ${card.bg} h-100 shadow">
|
| 115 |
+
<div class="card-body">
|
| 116 |
+
<h5 class="card-title" style="height: 50px;">${card.title}</h5>
|
| 117 |
+
<p class="card-text fs-1">${priorityFlags[card.flag] ? 'Yes' : 'No'}</p>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
</div>
|
| 121 |
+
`;
|
| 122 |
+
});
|
| 123 |
+
$('#priorityCards').html(cardsHtml);
|
| 124 |
+
|
| 125 |
+
// Update sentiment table if available
|
| 126 |
+
if (response.sentiment_data && response.sentiment_data.table_html) {
|
| 127 |
+
$('#sentimentTable').html(response.sentiment_data.table_html);
|
| 128 |
+
$('#sentimentTable table').DataTable({
|
| 129 |
+
responsive: true
|
| 130 |
+
});
|
| 131 |
+
}
|
| 132 |
+
},
|
| 133 |
+
error: function(xhr) {
|
| 134 |
+
clearInterval(progressInterval);
|
| 135 |
+
$('#progressContainer').hide();
|
| 136 |
+
alert('Error: ' + (xhr.responseJSON?.error || 'An error occurred'));
|
| 137 |
+
}
|
| 138 |
+
});
|
| 139 |
+
});
|
| 140 |
+
});
|
| 141 |
+
</script>
|
| 142 |
+
{% endblock %}
|
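The polling loop above expects /progress/<key>/ to return {'stage': ..., 'percent': ...}, matching the dictionaries the classification view writes to the cache. The real endpoint is views.check_progress in ai_api/views.py; a minimal sketch of that contract (the fallback value is an assumption) looks like:

from django.core.cache import cache
from django.http import JsonResponse

def check_progress(request, key):
    # Return whatever progress dict the long-running view last cached for this key.
    return JsonResponse(cache.get(key, {"stage": "pending", "percent": 0}))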
ai_api/templates/home-copy.html
ADDED
|
@@ -0,0 +1,38 @@
| 1 |
+
<!-- templates/home.html -->
|
| 2 |
+
{% extends 'base.html' %}
|
| 3 |
+
|
| 4 |
+
{% block title %}Welcome to My Homepage{% endblock %}
|
| 5 |
+
|
| 6 |
+
{% block content %}
|
| 7 |
+
<h1>BERNAMA Fact Check Test Bed!</h1>
|
| 8 |
+
<div class="row col-12 mb-2">
|
| 9 |
+
<div class="card col-3 m-1">
|
| 10 |
+
<div class="card-body">
|
| 11 |
+
<h5 class="card-title">Claim Classification</h5>
|
| 12 |
+
<p class="card-text">Input a claim and submit for AI to classify the statement.</p>
|
| 13 |
+
<a href="/classification" class="btn btn-primary">Test Now</a>
|
| 14 |
+
</div>
|
| 15 |
+
</div>
|
| 16 |
+
<div class="card col-3 m-1">
|
| 17 |
+
<div class="card-body">
|
| 18 |
+
<h5 class="card-title">Image Profiling</h5>
|
| 19 |
+
<p class="card-text">Upload an image for AI to analyze.</p>
|
| 20 |
+
<a href="/image_profiling" class="btn btn-primary">Test Now</a>
|
| 21 |
+
</div>
|
| 22 |
+
</div>
|
| 23 |
+
<div class="card col-3 m-1">
|
| 24 |
+
<div class="card-body">
|
| 25 |
+
<h5 class="card-title">Register New Face</h5>
|
| 26 |
+
<p class="card-text">Insert a person name for AI to learn face recongnition.</p>
|
| 27 |
+
<a href="/register_face" class="btn btn-primary">Test Now</a>
|
| 28 |
+
</div>
|
| 29 |
+
</div>
|
| 30 |
+
<div class="card col-3 m-1">
|
| 31 |
+
<div class="card-body">
|
| 32 |
+
<h5 class="card-title">Transcription</h5>
|
| 33 |
+
<p class="card-text">Audio/Video to transcription (text)</p>
|
| 34 |
+
<a href="/transcription" class="btn btn-primary">Test Now</a>
|
| 35 |
+
</div>
|
| 36 |
+
</div>
|
| 37 |
+
</div>
|
| 38 |
+
{% endblock %}
|
ai_api/templates/home.html
ADDED
|
@@ -0,0 +1,60 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
|
| 3 |
+
{% block title %}BERNAMA Fact Check{% endblock %}
|
| 4 |
+
|
| 5 |
+
{% block content %}
|
| 6 |
+
|
| 7 |
+
<!-- Features Section -->
|
| 8 |
+
<section id="features" class="py-5">
|
| 9 |
+
<div class="container">
|
| 10 |
+
<h2 class="text-center fw-bold mb-5 display-6">Core AI Modules</h2>
|
| 11 |
+
<div class="row g-4">
|
| 12 |
+
<!-- Feature Card -->
|
| 13 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 14 |
+
<a href="/classification" class="text-decoration-none">
|
| 15 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 16 |
+
<div class="card-body">
|
| 17 |
+
<h5 class="card-title">Claim Classification</h5>
|
| 18 |
+
<p class="card-text text-muted">Input a claim and submit for AI to classify the statement.</p>
|
| 19 |
+
</div>
|
| 20 |
+
</div>
|
| 21 |
+
</a>
|
| 22 |
+
</div>
|
| 23 |
+
|
| 24 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 25 |
+
<a href="/transcription" class="text-decoration-none">
|
| 26 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 27 |
+
<div class="card-body">
|
| 28 |
+
<h5 class="card-title">Transcription</h5>
|
| 29 |
+
<p class="card-text text-muted">Convert spoken words into text using advanced speech-to-text models.</p>
|
| 30 |
+
</div>
|
| 31 |
+
</div>
|
| 32 |
+
</a>
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 36 |
+
<a href="/image_profiling" class="text-decoration-none">
|
| 37 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 38 |
+
<div class="card-body">
|
| 39 |
+
<h5 class="card-title">Image Processing</h5>
|
| 40 |
+
<p class="card-text text-muted">Image profiling; face detection, metadata, captioning etc.</p>
|
| 41 |
+
</div>
|
| 42 |
+
</div>
|
| 43 |
+
</a>
|
| 44 |
+
</div>
|
| 45 |
+
|
| 46 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 47 |
+
<a href="/register_face" class="text-decoration-none">
|
| 48 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 49 |
+
<div class="card-body">
|
| 50 |
+
<h5 class="card-title">Face Register</h5>
|
| 51 |
+
<p class="card-text text-muted">Register new face.</p>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
</a>
|
| 55 |
+
</div>
|
| 56 |
+
</div>
|
| 57 |
+
</div>
|
| 58 |
+
</section>
|
| 59 |
+
|
| 60 |
+
{% endblock %}
|
ai_api/templates/image_profiling.html
ADDED
|
@@ -0,0 +1,122 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Image Processing</h2>
|
| 4 |
+
|
| 5 |
+
<form class="mb-4" method="POST" enctype="multipart/form-data">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
{{ form.as_p }}
|
| 8 |
+
<button type="submit" class="btn btn-primary">Upload Image</button>
|
| 9 |
+
</form>
|
| 10 |
+
|
| 11 |
+
{% if proccessed %}
|
| 12 |
+
<div class="mt-4">
|
| 13 |
+
<div class="nav nav-tabs" id="myTab" role="tablist">
|
| 14 |
+
<a class="nav-item nav-link active" id="home-tab" data-bs-toggle="tab" href="#home-tab-pane">Uploaded Image</a>
|
| 15 |
+
<a class="nav-item nav-link" id="profile-tab" data-bs-toggle="tab" href="#profile-tab-pane">Face Detects</a>
|
| 16 |
+
<a class="nav-item nav-link" id="contact-tab" data-bs-toggle="tab" href="#contact-tab-pane">OCR Texts</a>
|
| 17 |
+
<a class="nav-item nav-link" id="disabled-tab" data-bs-toggle="tab" href="#disabled-tab-pane">Metadata</a>
|
| 18 |
+
<a class="nav-item nav-link" id="augmentive-tab" data-bs-toggle="tab" href="#augmentive-tab-pane">Augmentive</a>
|
| 19 |
+
</div>
|
| 20 |
+
|
| 21 |
+
<div class="tab-content mt-4">
|
| 22 |
+
<div id="home-tab-pane" class="tab-pane fade show active">
|
| 23 |
+
<img class="img-fluid mx-auto rounded" src="{{ uploaded_base64 }}" alt="Uploaded Image">
|
| 24 |
+
</div>
|
| 25 |
+
|
| 26 |
+
<div id="profile-tab-pane" class="tab-pane fade">
|
| 27 |
+
{% if cropped_faces %}
|
| 28 |
+
<div class="row g-3">
|
| 29 |
+
<div class="col-md-5">
|
| 30 |
+
<h3 class="mt-4 fw-bold">Detected Faces</h3>
|
| 31 |
+
<img class="img-fluid rounded" src="{{ image_with_labels }}" alt="Detected Faces">
|
| 32 |
+
</div>
|
| 33 |
+
|
| 34 |
+
<div class="col-md-7">
|
| 35 |
+
<h3 class="mt-4 fw-bold">Cropped Faces</h3>
|
| 36 |
+
<div class="d-flex flex-wrap gap-4">
|
| 37 |
+
{% for face, face_name, distance, fdescription in cropped_faces %}
|
| 38 |
+
<div class="text-center text-xs" style="width: 80px;">
|
| 39 |
+
<img src="{{ face }}" alt="Cropped Face" class="img-thumbnail img-fluid mb-1">
|
| 40 |
+
<div style="font-size:10px">
|
| 41 |
+
<strong>{{ face_name }}</strong><br>{{ fdescription }}
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
{% endfor %}
|
| 45 |
+
</div>
|
| 46 |
+
</div>
|
| 47 |
+
</div>
|
| 48 |
+
{% endif %}
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
<div id="contact-tab-pane" class="tab-pane fade">
|
| 52 |
+
{% if texts %}
|
| 53 |
+
<div class="d-flex flex-wrap gap-2">
|
| 54 |
+
{% for text in texts %}
|
| 55 |
+
<span class="badge bg-success text-white">{{ text }}</span>
|
| 56 |
+
{% endfor %}
|
| 57 |
+
</div>
|
| 58 |
+
{% endif %}
|
| 59 |
+
</div>
|
| 60 |
+
|
| 61 |
+
<div id="disabled-tab-pane" class="tab-pane fade">
|
| 62 |
+
<div class="d-flex flex-wrap gap-4">
|
| 63 |
+
{% if metadata %}
|
| 64 |
+
<div class="w-100">
|
| 65 |
+
<table class="table table-sm table-striped">
|
| 66 |
+
<thead class="table-light">
|
| 67 |
+
<tr>
|
| 68 |
+
<th>IPTC Field</th>
|
| 69 |
+
<th>Value</th>
|
| 70 |
+
</tr>
|
| 71 |
+
</thead>
|
| 72 |
+
<tbody>
|
| 73 |
+
{% for tag, value in metadata.items %}
|
| 74 |
+
<tr>
|
| 75 |
+
<td>{{ tag }}</td>
|
| 76 |
+
<td>{{ value }}</td>
|
| 77 |
+
</tr>
|
| 78 |
+
{% endfor %}
|
| 79 |
+
</tbody>
|
| 80 |
+
</table>
|
| 81 |
+
</div>
|
| 82 |
+
{% endif %}
|
| 83 |
+
|
| 84 |
+
{% if exifs %}
|
| 85 |
+
<div class="w-100">
|
| 86 |
+
<table class="table table-sm table-striped">
|
| 87 |
+
<thead class="table-light">
|
| 88 |
+
<tr>
|
| 89 |
+
<th>EXIF Field</th>
|
| 90 |
+
<th>Value</th>
|
| 91 |
+
</tr>
|
| 92 |
+
</thead>
|
| 93 |
+
<tbody>
|
| 94 |
+
{% for tag, value in exifs.items %}
|
| 95 |
+
<tr>
|
| 96 |
+
<td>{{ tag }}</td>
|
| 97 |
+
<td>{{ value }}</td>
|
| 98 |
+
</tr>
|
| 99 |
+
{% endfor %}
|
| 100 |
+
</tbody>
|
| 101 |
+
</table>
|
| 102 |
+
</div>
|
| 103 |
+
{% endif %}
|
| 104 |
+
</div>
|
| 105 |
+
</div>
|
| 106 |
+
|
| 107 |
+
<div id="augmentive-tab-pane" class="tab-pane fade">
|
| 108 |
+
{% if description %}
|
| 109 |
+
<h3 class="fw-semibold">{{ description }}</h3>
|
| 110 |
+
{% endif %}
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<div id="reverse-tab-pane" class="tab-pane fade">
|
| 114 |
+
{% if reverse_images %}
|
| 115 |
+
{{ reverse_images }}
|
| 116 |
+
{% endif %}
|
| 117 |
+
</div>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
|
| 121 |
+
{% endif %}
|
| 122 |
+
{% endblock %}
|
ai_api/templates/register_face.html
ADDED
|
@@ -0,0 +1,42 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Face Register</h2>
|
| 4 |
+
|
| 5 |
+
<form method="POST" enctype="multipart/form-data" class="mb-4">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
<div class="row g-4">
|
| 8 |
+
<div class="col-md-6">
|
| 9 |
+
<label for="{{ form.person.id_for_label }}" class="form-label">
|
| 10 |
+
{{ form.person.label }}
|
| 11 |
+
</label>
|
| 12 |
+
{{ form.person }}
|
| 13 |
+
</div>
|
| 14 |
+
|
| 15 |
+
<div class="col-md-6">
|
| 16 |
+
<label for="{{ form.keywords.id_for_label }}" class="form-label">
|
| 17 |
+
{{ form.keywords.label }}
|
| 18 |
+
</label>
|
| 19 |
+
{{ form.keywords }}
|
| 20 |
+
</div>
|
| 21 |
+
</div>
|
| 22 |
+
|
| 23 |
+
<div class="row g-4">
|
| 24 |
+
<div class="col-md-6">
|
| 25 |
+
<label for="{{ form.images.id_for_label }}" class="form-label">
|
| 26 |
+
{{ form.images.label }}
|
| 27 |
+
</label>
|
| 28 |
+
{{ form.images }}
|
| 29 |
+
</div>
|
| 30 |
+
</div>
|
| 31 |
+
|
| 32 |
+
<button type="submit" class="btn btn-primary mt-2">
|
| 33 |
+
Register
|
| 34 |
+
</button>
|
| 35 |
+
</form>
|
| 36 |
+
|
| 37 |
+
{% if result %}
|
| 38 |
+
<div class="mt-4 bg-light p-4 rounded shadow-sm">
|
| 39 |
+
<p class="mb-0">{{ result }}</p>
|
| 40 |
+
</div>
|
| 41 |
+
{% endif %}
|
| 42 |
+
{% endblock %}
|
ai_api/templates/transcription.html
ADDED
|
@@ -0,0 +1,159 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Transcription</h2>
|
| 4 |
+
|
| 5 |
+
<form method="post" class="mb-3" id="yt-form" enctype="multipart/form-data">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
{{ form.as_p }}
|
| 8 |
+
<input type="hidden" value="{{progress_key}}" name="progress_key">
|
| 9 |
+
<button type="submit" class="btn btn-primary" id="btnSubmit">
|
| 10 |
+
Transcribe
|
| 11 |
+
</button>
|
| 12 |
+
</form>
|
| 13 |
+
|
| 14 |
+
<!-- Progress Bar -->
|
| 15 |
+
<div class="progress mb-4 d-none" id="progress-container">
|
| 16 |
+
<div class="progress-bar progress-bar-striped progress-bar-animated"
|
| 17 |
+
role="progressbar"
|
| 18 |
+
aria-valuenow="0"
|
| 19 |
+
aria-valuemin="0"
|
| 20 |
+
aria-valuemax="100"
|
| 21 |
+
style="width: 0%">
|
| 22 |
+
</div>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
<!-- Transcription Result -->
|
| 26 |
+
<div id="transcription" class="d-none">
|
| 27 |
+
<div class="bg-light p-4 rounded shadow-sm">
|
| 28 |
+
<div class="container"></div>
|
| 29 |
+
</div>
|
| 30 |
+
</div>
|
| 31 |
+
{% endblock %}
|
| 32 |
+
|
| 33 |
+
{% block scripts %}
|
| 34 |
+
<script src="https://rawcdn.githack.com/mozilla/vtt.js/master/dist/vtt.min.js"></script>
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
<script>
|
| 39 |
+
function copyToClipboard(selector) {
|
| 40 |
+
const text = $(selector).text(); // Get innerText
|
| 41 |
+
navigator.clipboard.writeText(text)
|
| 42 |
+
.then(() => {
|
| 43 |
+
// console.log('Copied to clipboard:', text);
|
| 44 |
+
})
|
| 45 |
+
.catch(err => {
|
| 46 |
+
console.error('Failed to copy:', err);
|
| 47 |
+
});
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
function ucfirst(str) {
|
| 51 |
+
if (!str) return '';
|
| 52 |
+
return str.charAt(0).toUpperCase() + str.slice(1);
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
$(document).ready(function () {
|
| 56 |
+
$('#yt-form').on('submit', function (e) {
|
| 57 |
+
e.preventDefault();
|
| 58 |
+
|
| 59 |
+
$('#btnSubmit').text('Downloading...');
|
| 60 |
+
$('#btnSubmit').prop('disabled', true);
|
| 61 |
+
|
| 62 |
+
$('#progress-container').removeClass('d-none');
|
| 63 |
+
const $bar = $('.progress-bar');
|
| 64 |
+
const key = '{{ progress_key }}';
|
| 65 |
+
var formData = new FormData(this);
|
| 66 |
+
|
| 67 |
+
$.ajax({
|
| 68 |
+
url: '.',
|
| 69 |
+
type: 'POST',
|
| 70 |
+
data: formData,
|
| 71 |
+
processData: false,
|
| 72 |
+
contentType: false,
|
| 73 |
+
success: function (response) {
|
| 74 |
+
if (response.segments) {
|
| 75 |
+
$('#transcription').removeClass('d-none');
|
| 76 |
+
|
| 77 |
+
$('#progress-container').removeClass('d-none');
|
| 78 |
+
$('#transcription .container').empty(); // Clear previous content
|
| 79 |
+
|
| 80 |
+
// Insert audio HTML
|
| 81 |
+
$('#transcription .container').append(response.audio_file);
|
| 82 |
+
|
| 83 |
+
// Add subtitle box
|
| 84 |
+
const subtitleBox = $('<div id="subtitleBox" style="padding:1em;background:#222;color:white;margin-top:10px;min-height:40px;"></div>')
|
| 85 |
+
.text("Play the audio");
|
| 86 |
+
$('#transcription .container').append(subtitleBox);
|
| 87 |
+
|
| 88 |
+
// Get the audio file URL from the HTML string
|
| 89 |
+
const audioSrcMatch = response.audio_file.match(/src="([^"]+)"/);
|
| 90 |
+
if (!audioSrcMatch) return;
|
| 91 |
+
|
| 92 |
+
const audioUrl = audioSrcMatch[1]; // /media/uploads/file.wav
|
| 93 |
+
const vttUrl = audioUrl.replace('/uploads/', '/vtt/').replace(/\.\w+$/, '.vtt'); // change extension to .vtt
|
| 94 |
+
|
| 95 |
+
// Load and parse the VTT file using vtt.js
|
| 96 |
+
const audio = document.querySelector('#transcription audio');
|
| 97 |
+
let cues = [];
|
| 98 |
+
|
| 99 |
+
fetch(vttUrl)
|
| 100 |
+
.then(res => res.text())
|
| 101 |
+
.then(vttData => {
|
| 102 |
+
const parser = new WebVTT.Parser(window, WebVTT.StringDecoder());
|
| 103 |
+
parser.oncue = function (cue) {
|
| 104 |
+
cues.push(cue);
|
| 105 |
+
};
|
| 106 |
+
parser.parse(vttData);
|
| 107 |
+
parser.flush();
|
| 108 |
+
});
|
| 109 |
+
|
| 110 |
+
audio.addEventListener('timeupdate', () => {
|
| 111 |
+
const currentTime = audio.currentTime;
|
| 112 |
+
const activeCue = cues.find(cue => currentTime >= cue.startTime && currentTime <= cue.endTime);
|
| 113 |
+
document.getElementById('subtitleBox').textContent = activeCue ? activeCue.text : '';
|
| 114 |
+
});
|
| 115 |
+
|
| 116 |
+
$('<div class="accordion">\
|
| 117 |
+
<div class="accordion-item">\
|
| 118 |
+
<h2 class="accordion-header" id="headingOne">\
|
| 119 |
+
<button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">\
|
| 120 |
+
Full Transcription \
|
| 121 |
+
</button>\
|
| 122 |
+
</h2>\
|
| 123 |
+
<div id="collapseOne" class="accordion-collapse collapse show" aria-labelledby="headingOne" data-bs-parent="#accordionExample">\
|
| 124 |
+
<div class="accordion-body">\
|
| 125 |
+
<div class="float-end"> <a href="'+vttUrl+'" download class="btn btn-sm btn-info me-1" title="Download"> <i class="fa fa-download"></i></a><button class="btn btn-sm me-1 btn-info" title="Copy" onClick="copyToClipboard(\'#segments\')"> <i class="fa fa-clipboard"></i></button></div>\
|
| 126 |
+
<div class="mt-3" id="segments"></div>\
|
| 127 |
+
</div>\
|
| 128 |
+
</div>\
|
| 129 |
+
</div>\
|
| 130 |
+
</div>').appendTo('#transcription .container');
|
| 131 |
+
|
| 132 |
+
$.each(response.segments, function(index, segment) {
|
| 133 |
+
var pElement = $('<pre></pre>').text(segment.text);
|
| 134 |
+
$('#segments').append(pElement);
|
| 135 |
+
});
|
| 136 |
+
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
});
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
const interval = setInterval(function () {
|
| 143 |
+
$.getJSON(`/progress/${key}/`, function (data) {
|
| 144 |
+
$bar.css('width', data.percent + '%');
|
| 145 |
+
$bar.attr('aria-valuenow', data.percent);
|
| 146 |
+
// $bar.html(data.percent + '%');
|
| 147 |
+
$('#btnSubmit').text(ucfirst(data.stage) + '...');
|
| 148 |
+
|
| 149 |
+
if (data.stage === 'done') {
|
| 150 |
+
$('#btnSubmit').prop('disabled', false).text('Transcribe');
|
| 151 |
+
clearInterval(interval);
|
| 152 |
+
$('#progress-container').addClass('d-none');
|
| 153 |
+
}
|
| 154 |
+
});
|
| 155 |
+
}, 1000);
|
| 156 |
+
});
|
| 157 |
+
});
|
| 158 |
+
</script>
|
| 159 |
+
{% endblock %}
|
ai_api/tests.py
ADDED
|
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
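The test module is still a stub; a minimal sketch of a test that could live here, covering the credential generation in ai_api/models.py (class and method names are illustrative):

from django.test import TestCase

from ai_api.models import APIClient


class APIClientCredentialTests(TestCase):
    def test_credentials_generated_on_save(self):
        client = APIClient.objects.create(name="test-client")
        self.assertEqual(len(client.client_id), 32)   # secrets.token_hex(16)
        self.assertEqual(len(client.secret_key), 64)  # secrets.token_hex(32)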
ai_api/urls.py
ADDED
|
@@ -0,0 +1,12 @@
from django.urls import path
from . import views


urlpatterns = [
    path('', views.home, name='home'),
    path('classification/', views.classification, name='classification'),
    path('image_profiling/', views.image_profiling, name='image_profiling'),
    path('register_face/', views.register_face, name='register_face'),
    path('transcription/', views.transcription, name='transcription'),
    path('progress/<str:key>/', views.check_progress, name='check_progress'),
]
ai_api/views.py
ADDED
|
@@ -0,0 +1,799 @@
| 1 |
+
from django.shortcuts import render
|
| 2 |
+
from django.http import JsonResponse
|
| 3 |
+
from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm,TranscribeForm, YouTubeURLForm
|
| 4 |
+
import shutil
|
| 5 |
+
from django.conf import settings
|
| 6 |
+
import torch
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
from PIL import Image as PILImage
|
| 10 |
+
import io
|
| 11 |
+
import tempfile
|
| 12 |
+
from django.core.cache import cache
|
| 13 |
+
import numpy as numpy_lib
|
| 14 |
+
import pickle
|
| 15 |
+
from deepface import DeepFace
|
| 16 |
+
import cv2
|
| 17 |
+
import base64
|
| 18 |
+
from io import BytesIO
|
| 19 |
+
from . import globals
|
| 20 |
+
import tempfile
|
| 21 |
+
import mimetypes
|
| 22 |
+
import subprocess
|
| 23 |
+
import logging
|
| 24 |
+
import uuid
|
| 25 |
+
import yt_dlp
|
| 26 |
+
import time
|
| 27 |
+
import re
|
| 28 |
+
from pydub import AudioSegment
|
| 29 |
+
import pandas as pd
|
| 30 |
+
import csv
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Setup logging for error handling
|
| 34 |
+
logger = logging.getLogger(__name__)
|
| 35 |
+
|
| 36 |
+
# from ai_api.library.devlab_image import DevLabImage
|
| 37 |
+
|
| 38 |
+
# devlab_image = DevLabImage()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
model = globals.model
|
| 42 |
+
tokenizer = globals.tokenizer
|
| 43 |
+
devlab_image = globals.devlab_image
|
| 44 |
+
|
| 45 |
+
with open(f"{globals.save_path}/label_map.json", "r") as f:
|
| 46 |
+
label_map = json.load(f)
|
| 47 |
+
|
| 48 |
+
index_to_label = {v: k for k, v in label_map.items()}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Create your views here.
|
| 52 |
+
def home(request):
|
| 53 |
+
return render(request, 'home.html')
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def classification(request):
|
| 57 |
+
from .library import simple_keyword_extraction, apify_scraper, priority_indexer, websearch, lowyat_crawler, sentiment_analyzer
|
| 58 |
+
|
| 59 |
+
if request.method == 'POST':
|
| 60 |
+
progress_key = request.POST.get("progress_key", str(uuid.uuid4()))
|
| 61 |
+
cache.set(progress_key, {'stage': 'starting', 'percent': 0})
|
| 62 |
+
|
| 63 |
+
text = request.POST.get("claim", "")
|
| 64 |
+
if not text:
|
| 65 |
+
return JsonResponse({"error": "No text provided"}, status=400)
|
| 66 |
+
|
| 67 |
+
claim_id = str(uuid.uuid4())[:8]
|
| 68 |
+
|
| 69 |
+
try:
|
| 70 |
+
# Step 1: Classification
|
| 71 |
+
cache.set(progress_key, {'stage': 'classifying', 'percent': 10})
|
| 72 |
+
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
|
| 73 |
+
with torch.no_grad():
|
| 74 |
+
outputs = model(**inputs)
|
| 75 |
+
prediction = torch.argmax(outputs.logits, dim=-1).item()
|
| 76 |
+
classification_result = index_to_label.get(prediction, "Unknown")
|
| 77 |
+
|
| 78 |
+
# Step 2: Keyword Extraction
|
| 79 |
+
cache.set(progress_key, {'stage': 'extracting_keywords', 'percent': 20})
|
| 80 |
+
keywords = simple_keyword_extraction.extract_keywords(text)
|
| 81 |
+
|
| 82 |
+
# Step 3: Setup paths
|
| 83 |
+
output_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'output')
|
| 84 |
+
report_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'reports')
|
| 85 |
+
raw_data_path = os.path.join(output_path, f'{claim_id}.csv')
|
| 86 |
+
|
| 87 |
+
# Step 4: Run TikTok scraper
|
| 88 |
+
cache.set(progress_key, {'stage': 'scraping_tiktok', 'percent': 30})
|
| 89 |
+
apify_scraper.run(
|
| 90 |
+
keywords,
|
| 91 |
+
output_path=raw_data_path,
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Step 5: Run web search
|
| 95 |
+
cache.set(progress_key, {'stage': 'searching_web', 'percent': 50})
|
| 96 |
+
web_search_results = websearch.run(
|
| 97 |
+
keywords,
|
| 98 |
+
output_path=os.path.join(output_path, f"{claim_id}_web.json"),
|
| 99 |
+
full_claim=text
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Step 6: Run Lowyat forum crawler
|
| 103 |
+
cache.set(progress_key, {'stage': 'crawling_forum', 'percent': 60})
|
| 104 |
+
lowyat_path = os.path.join(output_path, f"{claim_id}_lowyat.csv")
|
| 105 |
+
lowyat_sections = ["Kopitiam", "SeriousKopitiam"]
|
| 106 |
+
lowyat_results = lowyat_crawler.run(
|
| 107 |
+
keywords,
|
| 108 |
+
sections=lowyat_sections,
|
| 109 |
+
output_path=lowyat_path,
|
| 110 |
+
full_claim=text
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Step 7: Combine datasets
|
| 114 |
+
cache.set(progress_key, {'stage': 'combining_data', 'percent': 70})
|
| 115 |
+
if os.path.exists(lowyat_path):
|
| 116 |
+
lowyat_df = pd.read_csv(lowyat_path)
|
| 117 |
+
if os.path.exists(raw_data_path):
|
| 118 |
+
main_df = pd.read_csv(raw_data_path)
|
| 119 |
+
combined_df = pd.concat([main_df, lowyat_df], ignore_index=True)
|
| 120 |
+
combined_df.to_csv(raw_data_path, index=False)
|
| 121 |
+
else:
|
| 122 |
+
lowyat_df.to_csv(raw_data_path, index=False)
|
| 123 |
+
|
| 124 |
+
# Step 8: Run sentiment analysis
|
| 125 |
+
cache.set(progress_key, {'stage': 'analyzing_sentiment', 'percent': 80})
|
| 126 |
+
sentiment_csv = os.path.join(output_path, f"{claim_id}_sentiment.csv")
|
| 127 |
+
sentiment_data = {}
|
| 128 |
+
|
| 129 |
+
if os.path.exists(raw_data_path):
|
| 130 |
+
sentiment_analyzer.run(raw_data_path, sentiment_csv)
|
| 131 |
+
|
| 132 |
+
if os.path.exists(sentiment_csv):
|
| 133 |
+
sentiment_df = pd.read_csv(sentiment_csv)
|
| 134 |
+
sentiment_counts = sentiment_df['sentiment'].value_counts().to_dict()
|
| 135 |
+
sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
|
| 136 |
+
text_counts = {sentiment_map.get(k, k): v for k, v in sentiment_counts.items()}
|
| 137 |
+
sentiment_data = {
|
| 138 |
+
'counts': text_counts,
|
| 139 |
+
'table_html': csv_to_html_table(sentiment_csv)
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
# Step 9: Run priority indexing
|
| 143 |
+
cache.set(progress_key, {'stage': 'indexing_priority', 'percent': 90})
|
| 144 |
+
priority_json = os.path.join(report_path, f"{claim_id}_priority.json")
|
| 145 |
+
priority_data = {}
verdict = "UNVERIFIED"
|
| 146 |
+
|
| 147 |
+
if os.path.exists(sentiment_csv):
|
| 148 |
+
priority_indexer.run(
|
| 149 |
+
claim=text,
|
| 150 |
+
claim_id=claim_id,
|
| 151 |
+
keywords=keywords,
|
| 152 |
+
sentiment_csv=sentiment_csv,
|
| 153 |
+
output_path=priority_json
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if os.path.exists(priority_json):
|
| 157 |
+
with open(priority_json, 'r') as f:
|
| 158 |
+
priority_data = json.load(f)
|
| 159 |
+
verdict = determine_verdict(priority_data)
|
| 160 |
+
|
| 161 |
+
# Step 10: Complete
|
| 162 |
+
cache.set(progress_key, {'stage': 'complete', 'percent': 100})
|
| 163 |
+
|
| 164 |
+
return JsonResponse({
|
| 165 |
+
'classification': classification_result,
|
| 166 |
+
'keywords': keywords,
|
| 167 |
+
'sentiment_data': sentiment_data,
|
| 168 |
+
'priority_data': priority_data,
|
| 169 |
+
'verdict': verdict,
|
| 170 |
+
'progress_key': progress_key
|
| 171 |
+
})
|
| 172 |
+
|
| 173 |
+
except Exception as e:
|
| 174 |
+
logger.error(f"Error in classification: {str(e)}")
|
| 175 |
+
return JsonResponse({
|
| 176 |
+
'error': str(e),
|
| 177 |
+
'progress_key': progress_key
|
| 178 |
+
}, status=500)
|
| 179 |
+
|
| 180 |
+
else:
|
| 181 |
+
form = ClassificationForm()
|
| 182 |
+
return render(request, 'classification.html', {
|
| 183 |
+
'form': form,
|
| 184 |
+
'result': {}
|
| 185 |
+
})
|
| 186 |
+
|
| 187 |
+
def determine_verdict(priority_data):
|
| 188 |
+
"""Determine verdict based on priority data"""
|
| 189 |
+
# Extract priority flags from the data
|
| 190 |
+
if isinstance(priority_data, dict):
|
| 191 |
+
if "priority_flags" in priority_data:
|
| 192 |
+
priority_flags = priority_data["priority_flags"]
|
| 193 |
+
else:
|
| 194 |
+
# Assume the dictionary itself contains the flags
|
| 195 |
+
priority_flags = priority_data
|
| 196 |
+
else:
|
| 197 |
+
return "UNVERIFIED"
|
| 198 |
+
|
| 199 |
+
# Get sentiment counts if available
|
| 200 |
+
sentiment_counts = {}
|
| 201 |
+
if "sentiment_counts" in priority_data:
|
| 202 |
+
sentiment_counts = priority_data["sentiment_counts"]
|
| 203 |
+
# Convert keys to strings if they're not already
|
| 204 |
+
if any(not isinstance(k, str) for k in sentiment_counts.keys()):
|
| 205 |
+
sentiment_counts = {str(k): v for k, v in sentiment_counts.items()}
|
| 206 |
+
|
| 207 |
+
# Get priority score if available
|
| 208 |
+
priority_score = priority_data.get("priority_score", sum(priority_flags.values()))
|
| 209 |
+
|
| 210 |
+
# Get claim and keywords
|
| 211 |
+
claim = priority_data.get("claim", "").lower()
|
| 212 |
+
keywords = priority_data.get("keywords", [])
|
| 213 |
+
keywords_lower = [k.lower() for k in keywords]
|
| 214 |
+
|
| 215 |
+
# Check for specific claim patterns
|
| 216 |
+
is_azan_claim = any(word in claim for word in ["azan", "larang", "masjid", "pembesar suara"])
|
| 217 |
+
is_religious_claim = any(word in claim for word in ["islam", "agama", "masjid", "surau", "sembahyang", "solat", "zakat"])
|
| 218 |
+
|
| 219 |
+
# Check for economic impact
|
| 220 |
+
economic_related = priority_flags.get("economic_impact", 0) == 1
|
| 221 |
+
|
| 222 |
+
# Check for government involvement
|
| 223 |
+
government_related = priority_flags.get("affects_government", 0) == 1
|
| 224 |
+
|
| 225 |
+
# Check for law-related content
|
| 226 |
+
law_related = priority_flags.get("law_related", 0) == 1
|
| 227 |
+
|
| 228 |
+
# Check for confusion potential
|
| 229 |
+
causes_confusion = priority_flags.get("cause_confusion", 0) == 1
|
| 230 |
+
|
| 231 |
+
# Check for negative sentiment dominance
|
| 232 |
+
negative_dominant = False
|
| 233 |
+
if sentiment_counts:
|
| 234 |
+
pos = int(sentiment_counts.get("positive", sentiment_counts.get("1", 0)))
|
| 235 |
+
neg = int(sentiment_counts.get("negative", sentiment_counts.get("2", 0)))
|
| 236 |
+
neu = int(sentiment_counts.get("neutral", sentiment_counts.get("0", 0)))
|
| 237 |
+
negative_dominant = neg > pos and neg > neu
|
| 238 |
+
|
| 239 |
+
# Special case for azan claim (like the example provided)
|
| 240 |
+
if is_azan_claim and is_religious_claim and "larangan" in claim:
|
| 241 |
+
return "FALSE" # Claim about banning azan is false
|
| 242 |
+
|
| 243 |
+
# Determine verdict based on multiple factors
|
| 244 |
+
if priority_score >= 7.0 and negative_dominant and (government_related or law_related):
|
| 245 |
+
return "FALSE"
|
| 246 |
+
elif priority_score >= 5.0 and causes_confusion:
|
| 247 |
+
return "PARTIALLY_TRUE"
|
| 248 |
+
elif priority_score <= 3.0 and not negative_dominant:
|
| 249 |
+
return "TRUE"
|
| 250 |
+
elif economic_related and government_related:
|
| 251 |
+
# Special case for economic policies by government
|
| 252 |
+
if negative_dominant:
|
| 253 |
+
return "FALSE"
|
| 254 |
+
elif causes_confusion:
|
| 255 |
+
return "PARTIALLY_TRUE"
|
| 256 |
+
else:
|
| 257 |
+
return "TRUE"
|
| 258 |
+
else:
|
| 259 |
+
return "UNVERIFIED"
|
| 260 |
+
|
| 261 |
+
def image_profiling(request):
|
| 262 |
+
# import faiss
|
| 263 |
+
|
| 264 |
+
result = None
|
| 265 |
+
image_with_labels = None
|
| 266 |
+
cropped_faces_base64 = []
|
| 267 |
+
texts = None
|
| 268 |
+
proccessed = False
|
| 269 |
+
uploded_base64 = None
|
| 270 |
+
exifs = None
|
| 271 |
+
metadata = None
|
| 272 |
+
description = None
|
| 273 |
+
reverse_images = None
|
| 274 |
+
|
| 275 |
+
if request.method == 'POST':
|
| 276 |
+
form = ImageUploadForm(request.POST, request.FILES)
|
| 277 |
+
if form.is_valid():
|
| 278 |
+
proccessed = True
|
| 279 |
+
uploaded_image = request.FILES['image']
|
| 280 |
+
|
| 281 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
|
| 282 |
+
for chunk in uploaded_image.chunks():
|
| 283 |
+
tmp.write(chunk)
|
| 284 |
+
tmp_path = tmp.name
|
| 285 |
+
|
| 286 |
+
image = PILImage.open(uploaded_image)
|
| 287 |
+
image_np = numpy_lib.array(image.convert('RGB'))
|
| 288 |
+
exifs = devlab_image.extract_exif(tmp_path)
|
| 289 |
+
metadata = devlab_image.extract_metadata_exiftool(tmp_path)
|
| 290 |
+
description = devlab_image.generate_description_blip(tmp_path)
|
| 291 |
+
# reverse_images = devlab_image.reverse_search(tmp_path)
|
| 292 |
+
|
| 293 |
+
buffered = io.BytesIO()
|
| 294 |
+
image.save(buffered, format="PNG") # or "JPEG", depending on your image format
|
| 295 |
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 296 |
+
uploded_base64 = f"data:image/png;base64,{img_str}"
|
| 297 |
+
|
| 298 |
+
texts = devlab_image.extract_text_numpy(image_np)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
# Detect face embeddings using DeepFace
|
| 302 |
+
face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
if not face_embeddings:
|
| 306 |
+
return "❌ No faces detected in the image."
|
| 307 |
+
|
| 308 |
+
recognized_faces = {}
|
| 309 |
+
cropped_faces = []
|
| 310 |
+
|
| 311 |
+
for face_data in face_embeddings:
|
| 312 |
+
query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
|
| 313 |
+
|
| 314 |
+
results = devlab_image.query_embedding(query_embedding,1)
|
| 315 |
+
if results and len(results) > 0 and len(results[0]) > 0:
|
| 316 |
+
entity = results[0][0].entity
|
| 317 |
+
print(f"Entity: {entity}") # See what fields are present in the entity
|
| 318 |
+
|
| 319 |
+
face_name = entity.get('name') if entity else 'Unknown'
|
| 320 |
+
fdescription = entity.get('short_description') if entity else ''
|
| 321 |
+
if fdescription is None:
|
| 322 |
+
fdescription = ''
|
| 323 |
+
|
| 324 |
+
distance = round(results[0][0].distance, 4)
|
| 325 |
+
|
| 326 |
+
if distance * 100 > 95:
|
| 327 |
+
face_name = f"{face_name} (CLOSEST)"
|
| 328 |
+
# Store recognized face data
|
| 329 |
+
recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
|
| 330 |
+
"name": face_name,
|
| 331 |
+
"distance": distance,
|
| 332 |
+
"description": fdescription,
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
# Face location for drawing rectangle and adding label
|
| 336 |
+
face_location = face_data["facial_area"]
|
| 337 |
+
x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
|
| 338 |
+
|
| 339 |
+
# Draw rectangle and label on the image
|
| 340 |
+
# cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
|
| 341 |
+
cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
| 342 |
+
|
| 343 |
+
# Crop the detected face and prepare it for displaying
|
| 344 |
+
cropped_face = image_np[y:y + h, x:x + w]
|
| 345 |
+
cropped_faces.append([cropped_face, face_name, distance, fdescription])
|
| 346 |
+
|
| 347 |
+
# label = f"{face_name} (Dist: {round(distance, 2)})"
|
| 348 |
+
|
| 349 |
+
else:
|
| 350 |
+
print('No result found')
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
# Convert the image with labels to base64 for HTML rendering
|
| 355 |
+
_, buffer = cv2.imencode('.png', image_np)
|
| 356 |
+
image_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 357 |
+
|
| 358 |
+
# Convert cropped faces to base64 for displaying in template
|
| 359 |
+
cropped_faces_base64 = []
|
| 360 |
+
for face, face_name, distance, fdescription in cropped_faces:
|
| 361 |
+
_, buffer = cv2.imencode('.png', face)
|
| 362 |
+
face_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 363 |
+
cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",face_name, distance, fdescription])
|
| 364 |
+
|
| 365 |
+
# Prepare result for template rendering
|
| 366 |
+
result = recognized_faces
|
| 367 |
+
image_with_labels = f"data:image/png;base64,{image_base64}"
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
else:
|
| 371 |
+
form = ImageUploadForm()
|
| 372 |
+
|
| 373 |
+
return render(request, 'image_profiling.html', {
|
| 374 |
+
'form': form,
|
| 375 |
+
'proccessed' : proccessed,
|
| 376 |
+
'uploaded_base64': uploded_base64,
|
| 377 |
+
'image_with_labels': image_with_labels,
|
| 378 |
+
'cropped_faces': cropped_faces_base64,
|
| 379 |
+
'texts': texts,
|
| 380 |
+
'exifs': exifs,
|
| 381 |
+
'metadata': metadata,
|
| 382 |
+
'description': description,
|
| 383 |
+
'reverse_images': reverse_images
|
| 384 |
+
})
|
| 385 |
+
|
| 386 |
+
# def detect_faces2(request):
|
| 387 |
+
# import faiss
|
| 388 |
+
# import numpy as np
|
| 389 |
+
# import pickle
|
| 390 |
+
# from deepface import DeepFace
|
| 391 |
+
# import cv2
|
| 392 |
+
# import base64
|
| 393 |
+
# from io import BytesIO
|
| 394 |
+
# from PIL import Image
|
| 395 |
+
# import os
|
| 396 |
+
|
| 397 |
+
# result = None
|
| 398 |
+
# image_with_labels = None
|
| 399 |
+
# cropped_faces_base64 = []
|
| 400 |
+
|
| 401 |
+
# if request.method == 'POST':
|
| 402 |
+
# form = ImageUploadForm(request.POST, request.FILES)
|
| 403 |
+
# if form.is_valid():
|
| 404 |
+
# uploaded_image = request.FILES['image']
|
| 405 |
+
|
| 406 |
+
# # Open the uploaded image with Pillow and convert to RGB
|
| 407 |
+
# image = Image.open(uploaded_image).convert('RGB')
|
| 408 |
+
# image_np = numpy_lib.array(image)
|
| 409 |
+
|
| 410 |
+
# # Load FAISS index and metadata
|
| 411 |
+
# save_path = os.path.join(os.path.dirname(__file__), "deepface")
|
| 412 |
+
# try:
|
| 413 |
+
# index = faiss.read_index(save_path + "/faiss_hnsw_index.bin")
|
| 414 |
+
# with open(save_path + "/metadata.pkl", "rb") as f:
|
| 415 |
+
# names = pickle.load(f)
|
| 416 |
+
# except Exception as e:
|
| 417 |
+
# return f"Error loading FAISS index or metadata: {str(e)}"
|
| 418 |
+
|
| 419 |
+
# # Set search parameters for better accuracy in FAISS
|
| 420 |
+
# index.hnsw.efSearch = 100 # Larger = better accuracy, but slower
|
| 421 |
+
|
| 422 |
+
# # Detect face embeddings using DeepFace
|
| 423 |
+
# face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
|
| 424 |
+
|
| 425 |
+
# if not face_embeddings:
|
| 426 |
+
# return "❌ No faces detected in the image."
|
| 427 |
+
|
| 428 |
+
# recognized_faces = {}
|
| 429 |
+
# cropped_faces = []
|
| 430 |
+
|
| 431 |
+
# for face_data in face_embeddings:
|
| 432 |
+
# query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
|
| 433 |
+
|
| 434 |
+
# # Search for the closest matches in the FAISS index
|
| 435 |
+
# D, I = index.search(query_embedding, 1) # D = distances, I = indices
|
| 436 |
+
|
| 437 |
+
# # Get the top match for this face
|
| 438 |
+
# face_name = names[I[0][0]]
|
| 439 |
+
# distance = D[0][0]
|
| 440 |
+
|
| 441 |
+
# # Store recognized face data
|
| 442 |
+
# recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
|
| 443 |
+
# "name": face_name,
|
| 444 |
+
# "distance": round(distance, 4)
|
| 445 |
+
# }
|
| 446 |
+
|
| 447 |
+
# # Face location for drawing rectangle and adding label
|
| 448 |
+
# face_location = face_data["facial_area"]
|
| 449 |
+
# x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
|
| 450 |
+
|
| 451 |
+
# # Draw rectangle and label on the image
|
| 452 |
+
# # cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
|
| 453 |
+
# cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
| 454 |
+
|
| 455 |
+
# # Crop the detected face and prepare it for displaying
|
| 456 |
+
# cropped_face = image_np[y:y + h, x:x + w]
|
| 457 |
+
# cropped_faces.append([cropped_face, face_name])
|
| 458 |
+
|
| 459 |
+
# label = f"{face_name} (Dist: {round(distance, 4)})"
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# # Convert the image with labels to base64 for HTML rendering
|
| 464 |
+
# _, buffer = cv2.imencode('.png', image_np)
|
| 465 |
+
# image_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 466 |
+
|
| 467 |
+
# # Convert cropped faces to base64 for displaying in template
|
| 468 |
+
# cropped_faces_base64 = []
|
| 469 |
+
# for face,fname in cropped_faces:
|
| 470 |
+
# _, buffer = cv2.imencode('.png', face)
|
| 471 |
+
# face_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 472 |
+
# cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",fname])
|
| 473 |
+
|
| 474 |
+
# # Prepare result for template rendering
|
| 475 |
+
# result = recognized_faces
|
| 476 |
+
# image_with_labels = f"data:image/png;base64,{image_base64}"
|
| 477 |
+
|
| 478 |
+
# else:
|
| 479 |
+
# form = ImageUploadForm()
|
| 480 |
+
|
| 481 |
+
# return render(request, 'face_detection.html', {
|
| 482 |
+
# 'form': form,
|
| 483 |
+
# 'result': result,
|
| 484 |
+
# 'image_with_labels': image_with_labels,
|
| 485 |
+
# 'cropped_faces': cropped_faces_base64 # Pass the list of cropped faces to the template
|
| 486 |
+
# })
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def register_face(request):
|
| 490 |
+
from ai_api.library.devlab_image import DevLabImage
|
| 491 |
+
import os
|
| 492 |
+
from django.core.files.storage import FileSystemStorage
|
| 493 |
+
from django.conf import settings
|
| 494 |
+
|
| 495 |
+
result = None
|
| 496 |
+
if request.method == 'POST':
|
| 497 |
+
form = RegisterFaceForm(request.POST)
|
| 498 |
+
person = request.POST.get("person", "").upper()
|
| 499 |
+
keywords = request.POST.get("keywords", "")
|
| 500 |
+
files = request.FILES.getlist('images')
|
| 501 |
+
|
| 502 |
+
devlab_image = DevLabImage()
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
if files:
|
| 506 |
+
print('Upload manual')
|
| 507 |
+
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 508 |
+
upload_dir = os.path.join(project_root, 'people', person)
|
| 509 |
+
|
| 510 |
+
print(f"Saving to: {upload_dir}")
|
| 511 |
+
os.makedirs(upload_dir, exist_ok=True)
|
| 512 |
+
|
| 513 |
+
fs = FileSystemStorage(location=upload_dir)
|
| 514 |
+
|
| 515 |
+
for file in files:
|
| 516 |
+
filename = fs.save(file.name, file)
|
| 517 |
+
file_url = fs.url(filename)
|
| 518 |
+
print(f"Saved: {file_url}")
|
| 519 |
+
devlab_image.extract_face( person, keywords)
|
| 520 |
+
else:
|
| 521 |
+
print('Download from Google')
|
| 522 |
+
devlab_image.register_person(person, keywords)
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
else:
|
| 526 |
+
form = RegisterFaceForm()
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
return render(request, 'register_face.html', {
|
| 530 |
+
'form': form,
|
| 531 |
+
'result': result,
|
| 532 |
+
})
|
| 533 |
+
|
| 534 |
+
def check_progress(request, key):
|
| 535 |
+
# print(f"getting progress key {key}")
|
| 536 |
+
progress = cache.get(key, {'stage': 'downloading', 'percent': 0})
|
| 537 |
+
# print(progress)
|
| 538 |
+
return JsonResponse(progress)
|
| 539 |
+
|
| 540 |
+
def handle_uploaded_file(file):
|
| 541 |
+
mime_type, _ = mimetypes.guess_type(file.name)
|
| 542 |
+
|
| 543 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
|
| 544 |
+
output_audio_file = temp_audio_file.name
|
| 545 |
+
|
| 546 |
+
if mime_type and mime_type.startswith('video'):
|
| 547 |
+
# Save video temporarily
|
| 548 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[-1]) as temp_video_file:
|
| 549 |
+
for chunk in file.chunks():
|
| 550 |
+
temp_video_file.write(chunk)
|
| 551 |
+
video_path = temp_video_file.name
|
| 552 |
+
|
| 553 |
+
# Extract audio using ffmpeg
|
| 554 |
+
command = [
|
| 555 |
+
'ffmpeg',
|
| 556 |
+
'-y',
|
| 557 |
+
'-i', video_path,
|
| 558 |
+
'-vn', # no video
|
| 559 |
+
'-acodec', 'pcm_s16le', # WAV format
|
| 560 |
+
'-ar', '16000', # 16 kHz sample rate
|
| 561 |
+
'-ac', '1', # Mono channel
|
| 562 |
+
output_audio_file
|
| 563 |
+
]
|
| 564 |
+
|
| 565 |
+
try:
|
| 566 |
+
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
|
| 567 |
+
print("FFmpeg stderr:", result.stderr.decode())
|
| 568 |
+
|
| 569 |
+
except subprocess.CalledProcessError as e:
|
| 570 |
+
logger.error(f"ffmpeg failed with error: {e.stderr.decode()}")
|
| 571 |
+
raise Exception(f"Audio extraction failed: {e.stderr.decode()}")
|
| 572 |
+
|
| 573 |
+
# Clean up temporary video file
|
| 574 |
+
os.remove(video_path)
|
| 575 |
+
|
| 576 |
+
else:
|
| 577 |
+
# If audio, save it directly
|
| 578 |
+
with open(output_audio_file, 'wb') as f:
|
| 579 |
+
for chunk in file.chunks():
|
| 580 |
+
f.write(chunk)
|
| 581 |
+
|
| 582 |
+
return output_audio_file
|
| 583 |
+
|
| 584 |
+
def format_time(seconds):
|
| 585 |
+
# Convert seconds to WebVTT time format (hh:mm:ss.mmm)
|
| 586 |
+
m, s = divmod(seconds, 60)
|
| 587 |
+
h, m = divmod(m, 60)
|
| 588 |
+
ms = int((s - int(s)) * 1000) # Milliseconds
|
| 589 |
+
return f"{int(h):02}:{int(m):02}:{int(s):02}.{ms:03}"
|
| 590 |
+
|
| 591 |
+
def generate_vtt(segments):
|
| 592 |
+
# Generate the VTT content from the Whisper segments
|
| 593 |
+
vtt_content = "WEBVTT\n\n"
|
| 594 |
+
|
| 595 |
+
for segment in segments:
|
| 596 |
+
start_time = segment['start']
|
| 597 |
+
end_time = segment['end']
|
| 598 |
+
text = segment['text']
|
| 599 |
+
|
| 600 |
+
# Convert seconds to WebVTT time format
|
| 601 |
+
start_time_str = format_time(start_time)
|
| 602 |
+
end_time_str = format_time(end_time)
|
| 603 |
+
|
| 604 |
+
vtt_content += f"{start_time_str} --> {end_time_str}\n{text}\n\n"
|
| 605 |
+
|
| 606 |
+
return vtt_content
|
| 607 |
+
|
| 608 |
+
def save_vtt(output_audio_file, vtt):
|
| 609 |
+
base_name = os.path.splitext(os.path.basename(output_audio_file))[0]
|
| 610 |
+
new_filename = base_name + ".vtt"
|
| 611 |
+
|
| 612 |
+
final_path = os.path.join(settings.MEDIA_ROOT, 'vtt', new_filename)
|
| 613 |
+
os.makedirs(os.path.dirname(final_path), exist_ok=True)
|
| 614 |
+
|
| 615 |
+
with open(final_path, "w", encoding="utf-8") as f:
|
| 616 |
+
f.write(vtt)
|
| 617 |
+
|
| 618 |
+
return final_path
|
| 619 |
+
|
| 620 |
+
def transcription(request):
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
transcription = None
|
| 624 |
+
error = None
|
| 625 |
+
progress_key = str(uuid.uuid4())
|
| 626 |
+
|
| 627 |
+
if request.method == "POST":
|
| 628 |
+
|
| 629 |
+
progress_key = request.POST.get("progress_key", progress_key)
|
| 630 |
+
|
| 631 |
+
model = globals.whisper_model
|
| 632 |
+
form = YouTubeURLForm(request.POST)
|
| 633 |
+
|
| 634 |
+
#if form.is_valid():
|
| 635 |
+
file = request.FILES.get('file')
|
| 636 |
+
if file:
|
| 637 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
|
| 638 |
+
# for chunk in file.chunks():
|
| 639 |
+
# temp_file.write(chunk)
|
| 640 |
+
# output_audio_file = temp_file.name
|
| 641 |
+
output_audio_file = handle_uploaded_file(file)
|
| 642 |
+
if os.path.getsize(output_audio_file) == 0:
|
| 643 |
+
raise RuntimeError("FFmpeg produced an empty audio file.")
|
| 644 |
+
|
| 645 |
+
print(f"transcribing : {output_audio_file}")
|
| 646 |
+
cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
|
| 647 |
+
result = model.transcribe(output_audio_file,verbose=False)
|
| 648 |
+
vtt = generate_vtt(result['segments'])
|
| 649 |
+
vtt_file = save_vtt(output_audio_file, vtt)
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
else:
|
| 653 |
+
cache.set(progress_key, {'stage': 'downloading', 'percent': 0})
|
| 654 |
+
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
|
| 655 |
+
|
| 656 |
+
def progress_hook(d):
|
| 657 |
+
# print(f"status {d['status']}")
|
| 658 |
+
if d['status'] == 'downloading':
|
| 659 |
+
# print(d)
|
| 660 |
+
percent_str = d.get('_percent_str', '0%').strip()
|
| 661 |
+
clean_str = ansi_escape.sub('', percent_str).strip()
|
| 662 |
+
# print(f"clean percent_str: {repr(clean_str)}") # e.g. '100.0%'
|
| 663 |
+
|
| 664 |
+
try:
|
| 665 |
+
match = re.search(r'(\d+(?:\.\d+)?)', clean_str)
|
| 666 |
+
if match:
|
| 667 |
+
percent = float(match.group(1))
|
| 668 |
+
else:
|
| 669 |
+
print("❌ Regex didn't match!")
|
| 670 |
+
percent = 0
|
| 671 |
+
except Exception as e:
|
| 672 |
+
print(f"❌ Error parsing percent: {e}")
|
| 673 |
+
percent = 0
|
| 674 |
+
|
| 675 |
+
# print(f"✅ current progress for {progress_key} is: {percent}")
|
| 676 |
+
cache.set(progress_key, {'stage': 'downloading', 'percent': percent})
|
| 677 |
+
|
| 678 |
+
url = request.POST.get('url')
|
| 679 |
+
unique_id = str(uuid.uuid4())
|
| 680 |
+
temp_dir = tempfile.gettempdir()
|
| 681 |
+
base_filename = f"temp_{unique_id}"
|
| 682 |
+
download_path = f"{temp_dir}/{base_filename}.%(ext)s"
|
| 683 |
+
# print(f"download_path: {download_path}")
|
| 684 |
+
output_audio_file = f"{temp_dir}/{base_filename}.mp3"
|
| 685 |
+
|
| 686 |
+
ydl_opts = {
|
| 687 |
+
'format': 'bestaudio/best',
|
| 688 |
+
'outtmpl': download_path, # No fixed extension!
|
| 689 |
+
'postprocessors': [{
|
| 690 |
+
'key': 'FFmpegExtractAudio',
|
| 691 |
+
'preferredcodec': 'mp3',
|
| 692 |
+
'preferredquality': '192',
|
| 693 |
+
}],
|
| 694 |
+
'progress_hooks': [progress_hook],
|
| 695 |
+
'quiet': True,
|
| 696 |
+
'no_warnings': True,
|
| 697 |
+
'noplaylist': True,
|
| 698 |
+
}
|
| 699 |
+
print(f"downloading : {url}")
|
| 700 |
+
try:
|
| 701 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 702 |
+
ydl.download([url])
|
| 703 |
+
print(f"transcribing : {output_audio_file}")
|
| 704 |
+
cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
|
| 705 |
+
result = model.transcribe(output_audio_file,verbose=False)
|
| 706 |
+
vtt = generate_vtt(result['segments'])
|
| 707 |
+
vtt_file = save_vtt(output_audio_file,vtt)
|
| 708 |
+
except Exception as e:
|
| 709 |
+
error = str(e)
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
# transcription = result['text']
|
| 713 |
+
|
| 714 |
+
# audio = AudioSegment.from_file(output_audio_file)
|
| 715 |
+
# chunk_length_ms = 60 * 1000 # 1-minute chunks
|
| 716 |
+
# chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
|
| 717 |
+
# results = []
|
| 718 |
+
# total_chunks = len(chunks)
|
| 719 |
+
# cache.set(progress_key, {'stage': 'transcribing', 'percent': 0})
|
| 720 |
+
|
| 721 |
+
# for i, chunk in enumerate(chunks):
|
| 722 |
+
# temp_filename = f"temp_chunk_{i}.wav"
|
| 723 |
+
# chunk.export(temp_filename, format="wav")
|
| 724 |
+
|
| 725 |
+
# result = model.transcribe(temp_filename, verbose=False)
|
| 726 |
+
# results.append(result["text"])
|
| 727 |
+
|
| 728 |
+
# os.remove(temp_filename)
|
| 729 |
+
|
| 730 |
+
# # Update progress
|
| 731 |
+
# percent = int((i + 1) / total_chunks * 100)
|
| 732 |
+
# cache.set(progress_key, {'stage': 'transcribing', 'percent': percent})
|
| 733 |
+
|
| 734 |
+
# # Combine all chunk texts
|
| 735 |
+
# transcription = "\n".join(results)
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
cache.set(progress_key, {'stage': 'done', 'percent': 100})
|
| 739 |
+
|
| 740 |
+
filename = os.path.basename(output_audio_file)
|
| 741 |
+
final_path = os.path.join(settings.MEDIA_ROOT, 'uploads', filename)
|
| 742 |
+
os.makedirs(os.path.dirname(final_path), exist_ok=True)
|
| 743 |
+
shutil.move(output_audio_file, final_path)
|
| 744 |
+
|
| 745 |
+
# Public URL
|
| 746 |
+
|
| 747 |
+
|
| 748 |
+
file_url = settings.MEDIA_URL + 'uploads/' + filename
|
| 749 |
+
audio_html = f'<audio controls><source src="{file_url}">Your browser does not support the audio element.</audio>'
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
return JsonResponse({'text': result['text'], 'segments': result['segments'], 'audio_file': audio_html })
|
| 753 |
+
# if os.path.exists(output_audio_file):
|
| 754 |
+
# os.remove(output_audio_file)
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
# return render(request, 'transcription.html', {
|
| 758 |
+
# 'form': form,
|
| 759 |
+
# 'transcription': transcription,
|
| 760 |
+
# 'error': error,
|
| 761 |
+
# 'progress_key': progress_key,
|
| 762 |
+
# })
|
| 763 |
+
|
| 764 |
+
else:
|
| 765 |
+
form = TranscribeForm()
|
| 766 |
+
|
| 767 |
+
return render(request, 'transcription.html', {
|
| 768 |
+
'form': form,
|
| 769 |
+
'transcription': transcription,
|
| 770 |
+
'error': error,
|
| 771 |
+
'progress_key': progress_key,
|
| 772 |
+
})
|
| 773 |
+
|
| 774 |
+
def csv_to_html_table(filepath):
|
| 775 |
+
def is_valid_url(url):
|
| 776 |
+
# URL pattern matching - must start with http:// or https://
|
| 777 |
+
url_pattern = re.compile(
|
| 778 |
+
r'^https?://' # must start with http:// or https://
|
| 779 |
+
r'([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+' # domain
|
| 780 |
+
r'[a-zA-Z]{2,}' # TLD
|
| 781 |
+
r'(/[a-zA-Z0-9-._~:/?#[\]@!$&\'()*+,;=]*)?$' # path and query
|
| 782 |
+
)
|
| 783 |
+
return bool(url_pattern.match(url))
|
| 784 |
+
|
| 785 |
+
html = '<table id="dataset" class="table table-bordered mt-2 smaller">'
|
| 786 |
+
with open(filepath, newline='') as csvfile:
|
| 787 |
+
reader = csv.reader(csvfile)
|
| 788 |
+
for i, row in enumerate(reader):
|
| 789 |
+
if i == 0:
|
| 790 |
+
html += '<thead>'
|
| 791 |
+
html += "<tr>" + "".join(f"<th>{col}</th>" for col in row) + "</tr>"
|
| 792 |
+
html += '</thead>'
|
| 793 |
+
else:
|
| 794 |
+
html += "<tr>" + "".join(
|
| 795 |
+
f'<td><a href="{col}" target="_blank" rel="noopener noreferrer">{col}</a></td>' if is_valid_url(col) else f"<td>{col}</td>"
|
| 796 |
+
for col in row
|
| 797 |
+
) + "</tr>"
|
| 798 |
+
html += "</table>"
|
| 799 |
+
return html
|
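A quick illustration of the dictionary shape determine_verdict() above works with. The field names below simply mirror the keys the function itself reads (priority_flags, sentiment_counts, priority_score, claim, keywords); whether priority_indexer.run() emits exactly these keys in the *_priority.json report is an assumption, and the values are made up for the sketch.

# Sketch only: importing ai_api.views assumes the app's model files
# (label_map.json, the classifier in ai_api.globals) are available,
# since views.py loads them at import time.
from ai_api.views import determine_verdict

sample_priority_data = {
    "claim": "Government announces a new fuel subsidy scheme",
    "keywords": ["fuel", "subsidy", "government"],
    "priority_score": 7.5,
    "priority_flags": {
        "affects_government": 1,
        "economic_impact": 1,
        "law_related": 0,
        "cause_confusion": 1,
    },
    "sentiment_counts": {"positive": 3, "negative": 12, "neutral": 5},
}

# High priority score, negative-dominant sentiment and a government-related
# flag fall into the first branch of the verdict logic.
print(determine_verdict(sample_priority_data))  # -> "FALSE"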
ai_api/widgets.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.forms.widgets import ClearableFileInput
|
| 2 |
+
|
| 3 |
+
class MultipleFileInput(ClearableFileInput):
|
| 4 |
+
allow_multiple_selected = True
|
| 5 |
+
|
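ai_api/forms.py is elsewhere in this commit, so how MultipleFileInput gets wired into a form is not visible here. For reference, the usual Django 4.2 pairing is a companion field that cleans a list of uploaded files, which is what register_face() expects when it calls request.FILES.getlist('images'). This is only a sketch of that pattern, not necessarily what forms.py does:

from django import forms
from ai_api.widgets import MultipleFileInput

class MultipleFileField(forms.FileField):
    # FileField variant whose widget allows selecting several files at once.
    def __init__(self, *args, **kwargs):
        kwargs.setdefault("widget", MultipleFileInput())
        super().__init__(*args, **kwargs)

    def clean(self, data, initial=None):
        single_file_clean = super().clean
        if isinstance(data, (list, tuple)):
            # Clean each uploaded file individually and return the list.
            return [single_file_clean(d, initial) for d in data]
        return single_file_clean(data, initial)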
csv_people.py
ADDED
|
@@ -0,0 +1,20 @@
|
| 1 |
+
import os
|
| 2 |
+
import csv
|
| 3 |
+
|
| 4 |
+
# Path to the folder you want to scan
|
| 5 |
+
folder_path = 'people'
|
| 6 |
+
|
| 7 |
+
# Get all subfolder names
|
| 8 |
+
subfolders = [f.name for f in os.scandir(folder_path) if f.is_dir()]
|
| 9 |
+
|
| 10 |
+
# Path to the output CSV file
|
| 11 |
+
csv_file = 'subfolders.csv'
|
| 12 |
+
|
| 13 |
+
# Write the subfolder names to the CSV file
|
| 14 |
+
with open(csv_file, mode='w', newline='') as file:
|
| 15 |
+
writer = csv.writer(file)
|
| 16 |
+
writer.writerow(['Subfolder Name']) # Write the header
|
| 17 |
+
for subfolder in subfolders:
|
| 18 |
+
writer.writerow([subfolder]) # Write each subfolder name
|
| 19 |
+
|
| 20 |
+
print(f"Subfolder names have been written to {csv_file}")
|
delete_milvus.py
ADDED
|
@@ -0,0 +1,29 @@
|
| 1 |
+
from pymilvus import Collection, connections
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
|
| 8 |
+
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
|
| 9 |
+
|
| 10 |
+
connections.connect("default", host=milvus_host, port=int(milvus_port))
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Now, connect to the collection
|
| 14 |
+
collection = Collection("faces")
|
| 15 |
+
|
| 16 |
+
# Query the collection for entities whose 'name' matches a specific value
|
| 17 |
+
query = 'name == "YAB DATO SERI ANWAR IBRAHIM"' # Match entities with this exact name
|
| 18 |
+
|
| 19 |
+
# Run the query to find the matching entities
|
| 20 |
+
results = collection.query(query, output_fields=["id", "name"])
|
| 21 |
+
|
| 22 |
+
# Delete the matching entities, if any were found
|
| 23 |
+
if results:
|
| 24 |
+
ids_to_delete = [str(result["id"]) for result in results]
|
| 25 |
+
id_expr = f"id in [{', '.join(ids_to_delete)}]"
|
| 26 |
+
collection.delete(expr=id_expr)
|
| 27 |
+
print(f"✅ Deleted entities: {ids_to_delete}")
|
| 28 |
+
else:
|
| 29 |
+
print("❌ No entities found for deletion.")
|
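A small follow-up sketch for the script above, using the same connection and "faces" collection: re-running the query after the delete is a cheap way to confirm the entities are actually gone before relying on it (depending on the collection's consistency level the re-query may lag slightly behind the delete).

# Re-check how many entities still match the same expression after the delete.
remaining = collection.query('name == "YAB DATO SERI ANWAR IBRAHIM"', output_fields=["id"])
print(f"Entities still matching after delete: {len(remaining)}")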
devlab_next/.gitignore
ADDED
|
@@ -0,0 +1,68 @@
|
| 1 |
+
# Python bytecode files
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
__pycache__/
|
| 6 |
+
|
| 7 |
+
# Virtual environment
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
|
| 11 |
+
# Distribution / packaging
|
| 12 |
+
*.egg
|
| 13 |
+
*.egg-info
|
| 14 |
+
dist/
|
| 15 |
+
build/
|
| 16 |
+
*.whl
|
| 17 |
+
|
| 18 |
+
# IDE files
|
| 19 |
+
.idea/
|
| 20 |
+
.vscode/
|
| 21 |
+
|
| 22 |
+
# Jupyter Notebook files
|
| 23 |
+
.ipynb_checkpoints
|
| 24 |
+
|
| 25 |
+
# PyInstaller
|
| 26 |
+
*.manifest
|
| 27 |
+
*.spec
|
| 28 |
+
|
| 29 |
+
# Test and coverage reports
|
| 30 |
+
.coverage
|
| 31 |
+
*.coveragerc
|
| 32 |
+
nosetests.xml
|
| 33 |
+
coverage.xml
|
| 34 |
+
*.coveralls.yml
|
| 35 |
+
|
| 36 |
+
# MyPy
|
| 37 |
+
.mypy_cache/
|
| 38 |
+
.dmypy.json
|
| 39 |
+
dmypy.json
|
| 40 |
+
|
| 41 |
+
# Pytest
|
| 42 |
+
.cache/
|
| 43 |
+
|
| 44 |
+
# Sphinx documentation
|
| 45 |
+
docs/_build/
|
| 46 |
+
|
| 47 |
+
# pytest and flake8
|
| 48 |
+
*.log
|
| 49 |
+
|
| 50 |
+
# VS Code settings
|
| 51 |
+
.vscode/
|
| 52 |
+
|
| 53 |
+
# Django secrets
|
| 54 |
+
*.env
|
| 55 |
+
|
| 56 |
+
# Flask instance folder
|
| 57 |
+
instance/
|
| 58 |
+
|
| 59 |
+
# PyCharm project files
|
| 60 |
+
.idea/
|
| 61 |
+
|
| 62 |
+
# Other Python-related files
|
| 63 |
+
*.bak
|
| 64 |
+
*.swp
|
| 65 |
+
*.swo
|
| 66 |
+
ddet_classification/
|
| 67 |
+
.DS_Store
|
| 68 |
+
*.pkl
|
devlab_next/__init__.py
ADDED
|
File without changes
|
devlab_next/asgi.py
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
"""
|
| 2 |
+
ASGI config for devlab_next project.
|
| 3 |
+
|
| 4 |
+
It exposes the ASGI callable as a module-level variable named ``application``.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from django.core.asgi import get_asgi_application
|
| 13 |
+
|
| 14 |
+
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
|
| 15 |
+
|
| 16 |
+
application = get_asgi_application()
|
devlab_next/settings.py
ADDED
|
@@ -0,0 +1,166 @@
|
| 1 |
+
"""
|
| 2 |
+
Django settings for devlab_next project.
|
| 3 |
+
|
| 4 |
+
Generated by 'django-admin startproject' using Django 4.2.7.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/topics/settings/
|
| 8 |
+
|
| 9 |
+
For the full list of settings and their values, see
|
| 10 |
+
https://docs.djangoproject.com/en/4.2/ref/settings/
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
| 17 |
+
BASE_DIR = Path(__file__).resolve().parent.parent
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Quick-start development settings - unsuitable for production
|
| 21 |
+
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
|
| 22 |
+
|
| 23 |
+
# SECURITY WARNING: keep the secret key used in production secret!
|
| 24 |
+
SECRET_KEY = 'django-insecure-5a87e9*^s30hb+%+h@t^06493w2tpv7w6%+(0!#iu77b%*8=#i'
|
| 25 |
+
|
| 26 |
+
# SECURITY WARNING: don't run with debug turned on in production!
|
| 27 |
+
DEBUG = True
|
| 28 |
+
|
| 29 |
+
ALLOWED_HOSTS = ['127.0.0.1','fctestbed.bernama.com','localhost']
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Application definition
|
| 33 |
+
|
| 34 |
+
INSTALLED_APPS = [
|
| 35 |
+
'django.contrib.admin',
|
| 36 |
+
'django.contrib.auth',
|
| 37 |
+
'django.contrib.contenttypes',
|
| 38 |
+
'django.contrib.sessions',
|
| 39 |
+
'django.contrib.messages',
|
| 40 |
+
'django.contrib.staticfiles',
|
| 41 |
+
'rest_framework',
|
| 42 |
+
# 'ai_api',
|
| 43 |
+
'ai_api.apps.AiApiConfig',
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
MIDDLEWARE = [
|
| 47 |
+
'django.middleware.security.SecurityMiddleware',
|
| 48 |
+
'django.contrib.sessions.middleware.SessionMiddleware',
|
| 49 |
+
'django.middleware.common.CommonMiddleware',
|
| 50 |
+
'django.middleware.csrf.CsrfViewMiddleware',
|
| 51 |
+
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
| 52 |
+
'django.contrib.messages.middleware.MessageMiddleware',
|
| 53 |
+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
| 54 |
+
# 'ai_api.middleware.HMACAuthMiddleware'
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
ROOT_URLCONF = 'devlab_next.urls'
|
| 58 |
+
|
| 59 |
+
TEMPLATES = [
|
| 60 |
+
{
|
| 61 |
+
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
| 62 |
+
'DIRS': [],
|
| 63 |
+
'APP_DIRS': True,
|
| 64 |
+
'OPTIONS': {
|
| 65 |
+
'context_processors': [
|
| 66 |
+
'django.template.context_processors.debug',
|
| 67 |
+
'django.template.context_processors.request',
|
| 68 |
+
'django.contrib.auth.context_processors.auth',
|
| 69 |
+
'django.contrib.messages.context_processors.messages',
|
| 70 |
+
],
|
| 71 |
+
},
|
| 72 |
+
},
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
WSGI_APPLICATION = 'devlab_next.wsgi.application'
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Database
|
| 79 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
|
| 80 |
+
|
| 81 |
+
# DATABASES = {
|
| 82 |
+
# 'default': {
|
| 83 |
+
# 'ENGINE': 'django.db.backends.sqlite3',
|
| 84 |
+
# 'NAME': BASE_DIR / 'db.sqlite3',
|
| 85 |
+
# }
|
| 86 |
+
# }
|
| 87 |
+
|
| 88 |
+
DATABASES = {
|
| 89 |
+
"default": {
|
| 90 |
+
"ENGINE": "django.db.backends.postgresql",
|
| 91 |
+
"NAME": os.environ.get("DB_NAME", "factcheckapidb"),
|
| 92 |
+
"USER": os.environ.get("DB_USER", "postgres"),
|
| 93 |
+
"PASSWORD": os.environ.get("DB_PASSWORD", "postgres"),
|
| 94 |
+
"HOST": os.environ.get("DB_HOST", "127.0.0.1"),
|
| 95 |
+
"PORT": os.environ.get("DB_PORT", "5432"),
|
| 96 |
+
}
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Password validation
|
| 102 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
|
| 103 |
+
|
| 104 |
+
AUTH_PASSWORD_VALIDATORS = [
|
| 105 |
+
{
|
| 106 |
+
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
| 116 |
+
},
|
| 117 |
+
]
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Internationalization
|
| 121 |
+
# https://docs.djangoproject.com/en/4.2/topics/i18n/
|
| 122 |
+
|
| 123 |
+
LANGUAGE_CODE = 'en-us'
|
| 124 |
+
|
| 125 |
+
TIME_ZONE = 'UTC'
|
| 126 |
+
|
| 127 |
+
USE_I18N = True
|
| 128 |
+
|
| 129 |
+
USE_TZ = True
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# Static files (CSS, JavaScript, Images)
|
| 133 |
+
# https://docs.djangoproject.com/en/4.2/howto/static-files/
|
| 134 |
+
|
| 135 |
+
STATIC_URL = '/static/'
|
| 136 |
+
# STATIC_ROOT = BASE_DIR / 'static/'
|
| 137 |
+
|
| 138 |
+
STATICFILES_DIRS = [
|
| 139 |
+
os.path.join(BASE_DIR, 'static'),
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# Default primary key field type
|
| 145 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
|
| 146 |
+
|
| 147 |
+
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
| 148 |
+
|
| 149 |
+
MEDIA_URL = '/media/'
|
| 150 |
+
MEDIA_ROOT = BASE_DIR / 'media'
|
| 151 |
+
|
| 152 |
+
CACHES = {
|
| 153 |
+
'default': {
|
| 154 |
+
'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', # In-memory
|
| 155 |
+
'LOCATION': 'progress-cache',
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
devlab_next/urls.py
ADDED
|
@@ -0,0 +1,33 @@
|
| 1 |
+
"""
|
| 2 |
+
URL configuration for devlab_next project.
|
| 3 |
+
|
| 4 |
+
The `urlpatterns` list routes URLs to views. For more information please see:
|
| 5 |
+
https://docs.djangoproject.com/en/4.2/topics/http/urls/
|
| 6 |
+
Examples:
|
| 7 |
+
Function views
|
| 8 |
+
1. Add an import: from my_app import views
|
| 9 |
+
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
| 10 |
+
Class-based views
|
| 11 |
+
1. Add an import: from other_app.views import Home
|
| 12 |
+
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
| 13 |
+
Including another URLconf
|
| 14 |
+
1. Import the include() function: from django.urls import include, path
|
| 15 |
+
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
| 16 |
+
"""
|
| 17 |
+
from django.contrib import admin
|
| 18 |
+
from django.urls import path, include
|
| 19 |
+
from django.conf import settings
|
| 20 |
+
from django.conf.urls.static import static
|
| 21 |
+
import os
|
| 22 |
+
|
| 23 |
+
admin.site.site_header = "BERNAMA Fact Check"
|
| 24 |
+
admin.site.site_title = "BERNAMA Fact Check Portal"
|
| 25 |
+
admin.site.index_title = "Dashboard"
|
| 26 |
+
|
| 27 |
+
urlpatterns = [
|
| 28 |
+
path('admin/', admin.site.urls),
|
| 29 |
+
path('', include('ai_api.urls')),
|
| 30 |
+
path('api/v1/', include('ai_api.api_urls')),
|
| 31 |
+
]+ static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
|
| 32 |
+
|
| 33 |
+
urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
|
devlab_next/wsgi.py
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
"""
|
| 2 |
+
WSGI config for devlab_next project.
|
| 3 |
+
|
| 4 |
+
It exposes the WSGI callable as a module-level variable named ``application``.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from django.core.wsgi import get_wsgi_application
|
| 13 |
+
|
| 14 |
+
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
|
| 15 |
+
|
| 16 |
+
application = get_wsgi_application()
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,95 @@
|
| 1 |
+
version: '3.5'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
web:
|
| 5 |
+
build: .
|
| 6 |
+
container_name: django_app
|
| 7 |
+
mem_limit: 16g
|
| 8 |
+
command: gunicorn devlab_next.wsgi:application --bind 0.0.0.0:8000 --workers 3 --log-level debug
|
| 9 |
+
volumes:
|
| 10 |
+
- .:/app
|
| 11 |
+
ports:
|
| 12 |
+
- "8000:8000"
|
| 13 |
+
depends_on:
|
| 14 |
+
- milvus-standalone
|
| 15 |
+
environment:
|
| 16 |
+
- DJANGO_SETTINGS_MODULE=devlab_next.settings
|
| 17 |
+
- TF_CPP_MIN_LOG_LEVEL=2
|
| 18 |
+
networks:
|
| 19 |
+
- milvus_network
|
| 20 |
+
|
| 21 |
+
milvus-standalone:
|
| 22 |
+
container_name: milvus
|
| 23 |
+
image: milvusdb/milvus:v2.5.8
|
| 24 |
+
command: ["milvus", "run", "standalone"]
|
| 25 |
+
security_opt:
|
| 26 |
+
- seccomp:unconfined
|
| 27 |
+
restart: always
|
| 28 |
+
ports:
|
| 29 |
+
- "19530:19530" # gRPC
|
| 30 |
+
- "19121:19121" # HTTP (correct health port)
|
| 31 |
+
volumes:
|
| 32 |
+
- ./volumes/milvus:/var/lib/milvus
|
| 33 |
+
healthcheck:
|
| 34 |
+
test: ["CMD", "curl", "-f", "http://localhost:19121/healthz"]
|
| 35 |
+
interval: 30s
|
| 36 |
+
start_period: 90s
|
| 37 |
+
timeout: 20s
|
| 38 |
+
retries: 3
|
| 39 |
+
depends_on:
|
| 40 |
+
- etcd
|
| 41 |
+
- minio
|
| 42 |
+
environment:
|
| 43 |
+
ETCD_ENDPOINTS: etcd:2379
|
| 44 |
+
MINIO_ADDRESS: minio:9000
|
| 45 |
+
MINIO_ACCESS_KEY: minioadmin
|
| 46 |
+
MINIO_SECRET_KEY: minioadmin
|
| 47 |
+
MILVUS_LOG_LEVEL: debug
|
| 48 |
+
networks:
|
| 49 |
+
- milvus_network
|
| 50 |
+
|
| 51 |
+
etcd:
|
| 52 |
+
image: quay.io/coreos/etcd:v3.5.18
|
| 53 |
+
container_name: etcd
|
| 54 |
+
command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
| 55 |
+
environment:
|
| 56 |
+
- ETCD_AUTO_COMPACTION_MODE=revision
|
| 57 |
+
- ETCD_AUTO_COMPACTION_RETENTION=1000
|
| 58 |
+
- ETCD_QUOTA_BACKEND_BYTES=4294967296
|
| 59 |
+
- ETCD_SNAPSHOT_COUNT=50000
|
| 60 |
+
volumes:
|
| 61 |
+
- ./volumes/etcd:/etcd
|
| 62 |
+
healthcheck:
|
| 63 |
+
test: ["CMD", "etcdctl", "endpoint", "health"]
|
| 64 |
+
interval: 30s
|
| 65 |
+
timeout: 20s
|
| 66 |
+
retries: 3
|
| 67 |
+
ports:
|
| 68 |
+
- "2379:2379"
|
| 69 |
+
- "2380:2380"
|
| 70 |
+
networks:
|
| 71 |
+
- milvus_network
|
| 72 |
+
|
| 73 |
+
minio:
|
| 74 |
+
container_name: minio
|
| 75 |
+
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
|
| 76 |
+
environment:
|
| 77 |
+
MINIO_ACCESS_KEY: minioadmin
|
| 78 |
+
MINIO_SECRET_KEY: minioadmin
|
| 79 |
+
command: minio server /minio_data --console-address ":9001"
|
| 80 |
+
ports:
|
| 81 |
+
- "9000:9000"
|
| 82 |
+
- "9001:9001"
|
| 83 |
+
volumes:
|
| 84 |
+
- ./volumes/minio:/minio_data
|
| 85 |
+
healthcheck:
|
| 86 |
+
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
| 87 |
+
interval: 30s
|
| 88 |
+
timeout: 20s
|
| 89 |
+
retries: 3
|
| 90 |
+
networks:
|
| 91 |
+
- milvus_network
|
| 92 |
+
|
| 93 |
+
networks:
|
| 94 |
+
milvus_network:
|
| 95 |
+
driver: bridge
|
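Two deployment notes on this compose file, with a small connection sketch. First, from inside the web container the Milvus scripts above (delete_milvus.py, list_faces.py) cannot rely on their localhost default; the host has to be a name on milvus_network, so MILVUS_HOST would typically be set for the web service (an assumption, since the environment block above does not set it). Second, the progress polling in ai_api/views.py stores state in the LocMemCache configured in settings.py, which is per-process; with gunicorn --workers 3 a polling request can land on a worker that never saw the update, so a shared cache backend (Redis or the database cache) is the usual fix.

import os
from pymilvus import connections

# Sketch: resolve Milvus through the compose network instead of localhost.
connections.connect(
    "default",
    host=os.getenv("MILVUS_HOST", "milvus-standalone"),  # compose service name (assumption)
    port=int(os.getenv("MILVUS_PORT", "19530")),
)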
download_people.py
ADDED
|
@@ -0,0 +1,14 @@
|
| 1 |
+
from ai_api.library.devlab_image import DevLabImage
|
| 2 |
+
import csv
|
| 3 |
+
|
| 4 |
+
devlab_image = DevLabImage()
|
| 5 |
+
|
| 6 |
+
# # Open and read the CSV file
|
| 7 |
+
with open("subfolders.csv", mode="r", encoding="utf-8") as file:
|
| 8 |
+
reader = csv.reader(file)
|
| 9 |
+
for row in reader:
|
| 10 |
+
print(row[0], row[1]) # Each row is a list
|
| 11 |
+
devlab_image.register_person(row[0],row[1])
|
| 12 |
+
|
| 13 |
+
# field_value = input("Enter the name: ")
|
| 14 |
+
# devlab_image.download_person_images(field_value.upper())
|
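One mismatch worth flagging: csv_people.py above writes a single-column subfolders.csv with a 'Subfolder Name' header row, while this loop reads row[0] and row[1] and never skips the header, so it raises an IndexError on the header row unless the CSV is edited by hand to add a keywords column. A more defensive version of the same loop (sketch only, same DevLabImage API as above):

import csv
from ai_api.library.devlab_image import DevLabImage

devlab_image = DevLabImage()

with open("subfolders.csv", mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row written by csv_people.py
    for row in reader:
        if not row:
            continue
        name = row[0]
        keywords = row[1] if len(row) > 1 else ""  # keywords column may be missing
        devlab_image.register_person(name, keywords)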
list_faces.py
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
from pymilvus import Collection, connections
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
|
| 8 |
+
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
|
| 9 |
+
|
| 10 |
+
connections.connect("default", host=milvus_host, port=int(milvus_port))
|
| 11 |
+
|
| 12 |
+
# Now, connect to the collection
|
| 13 |
+
collection = Collection("faces")
|
| 14 |
+
|
| 15 |
+
# Query expression that retrieves all documents with a non-null 'id' (or use any valid field)
|
| 16 |
+
query = "id IS NOT NULL" # Valid query expression to fetch all documents
|
| 17 |
+
|
| 18 |
+
# Retrieve all documents, adjust fields based on your collection schema
|
| 19 |
+
results = collection.query(query, output_fields=["id", "name"])
|
| 20 |
+
|
| 21 |
+
# Print all results
|
| 22 |
+
for result in results:
|
| 23 |
+
print(f"ID: {result['id']}, Name: {result.get('name', 'N/A')}")
|