xspinners committed · Commit 090987a · 0 Parent(s)
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .dockerignore +9 -0
  2. .gitignore +82 -0
  3. Dockerfile +32 -0
  4. ai_api/.gitignore +68 -0
  5. ai_api/__init__.py +0 -0
  6. ai_api/admin.py +10 -0
  7. ai_api/api.py +44 -0
  8. ai_api/api_urls.py +10 -0
  9. ai_api/apps.py +63 -0
  10. ai_api/controllers/__init__.py +2 -0
  11. ai_api/controllers/classification.py +15 -0
  12. ai_api/controllers/transcription.py +16 -0
  13. ai_api/forms.py +86 -0
  14. ai_api/globals.py +6 -0
  15. ai_api/library/apify_scraper.py +893 -0
  16. ai_api/library/config.py +131 -0
  17. ai_api/library/devlab_image.py +487 -0
  18. ai_api/library/lowyat_crawler.py +714 -0
  19. ai_api/library/priority_indexer.py +360 -0
  20. ai_api/library/sentiment_analyzer.py +91 -0
  21. ai_api/library/simple_keyword_extraction.py +205 -0
  22. ai_api/library/websearch.py +237 -0
  23. ai_api/middleware.py +40 -0
  24. ai_api/migrations/0001_initial.py +24 -0
  25. ai_api/migrations/__init__.py +0 -0
  26. ai_api/models.py +18 -0
  27. ai_api/request_serializer.py +30 -0
  28. ai_api/templates/base-copy.html +35 -0
  29. ai_api/templates/base.html +61 -0
  30. ai_api/templates/classification.html +142 -0
  31. ai_api/templates/home-copy.html +38 -0
  32. ai_api/templates/home.html +60 -0
  33. ai_api/templates/image_profiling.html +122 -0
  34. ai_api/templates/register_face.html +42 -0
  35. ai_api/templates/transcription.html +159 -0
  36. ai_api/tests.py +3 -0
  37. ai_api/urls.py +12 -0
  38. ai_api/views.py +799 -0
  39. ai_api/widgets.py +5 -0
  40. csv_people.py +20 -0
  41. delete_milvus.py +29 -0
  42. devlab_next/.gitignore +68 -0
  43. devlab_next/__init__.py +0 -0
  44. devlab_next/asgi.py +16 -0
  45. devlab_next/settings.py +166 -0
  46. devlab_next/urls.py +33 -0
  47. devlab_next/wsgi.py +16 -0
  48. docker-compose.yml +95 -0
  49. download_people.py +14 -0
  50. list_faces.py +23 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.db
+ venv/
+ .git/
+ nohup.out
+ core
.gitignore ADDED
@@ -0,0 +1,82 @@
+ # Python bytecode files
+ *.pyc
+ *.pyo
+ *.pyd
+ __pycache__/
+
+ # Virtual environment
+ venv/
+ env/
+
+ # Distribution / packaging
+ *.egg
+ *.egg-info
+ dist/
+ build/
+ *.whl
+
+ # IDE files
+ .idea/
+ .vscode/
+
+ # Jupyter Notebook files
+ .ipynb_checkpoints
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Test and coverage reports
+ .coverage
+ *.coveragerc
+ nosetests.xml
+ coverage.xml
+ *.coveralls.yml
+
+ # MyPy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pytest
+ .cache/
+
+ # Sphinx documentation
+ docs/_build/
+
+ # pytest and flake8
+ *.log
+
+ # VS Code settings
+ .vscode/
+
+ # Django secrets
+ *.env
+
+ # Flask instance folder
+ instance/
+
+ # PyCharm project files
+ .idea/
+
+ # Other Python-related files
+ *.bak
+ *.swp
+ *.swo
+ ddet_classification/
+ .DS_Store
+ .pkl
+ people/
+ people_backup/
+ *.mp3
+ *.wav
+ media/uploads/
+ media/vtt/
+ volumes/
+ output/
+ reports/
+ data/
+ ai_api/library/data/
+ ai_api/library/output/
+ ai_api/library/cache/
+ ai_api/library/reports/
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.9-slim
+
+
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV TF_CPP_MIN_LOG_LEVEL=2
+
+ # Install dependencies
+ #RUN apt-get update && apt-get install -y exiftool ffmpeg curl libglib2.0-0 libsm6 libxext6 libxrender-dev
+ # Install Chrome & dependencies
+ RUN apt-get update && apt-get install -y \
+     wget unzip curl gnupg exiftool ffmpeg \
+     fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libatk1.0-0 libcups2 libdbus-1-3 libgdk-pixbuf2.0-0 \
+     libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1 \
+     chromium chromium-driver \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set work directory
+ WORKDIR /app
+
+ # Copy project files
+ COPY . /app
+
+ # Install Python packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose port
+ EXPOSE 8000
+
+ # Run app using Gunicorn
+ #CMD ["gunicorn", "--bind", "0.0.0.0:8000", "devlab_next.wsgi:application"]
+ CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]
+
ai_api/.gitignore ADDED
@@ -0,0 +1,68 @@
+ # Python bytecode files
+ *.pyc
+ *.pyo
+ *.pyd
+ __pycache__/
+
+ # Virtual environment
+ venv/
+ env/
+
+ # Distribution / packaging
+ *.egg
+ *.egg-info
+ dist/
+ build/
+ *.whl
+
+ # IDE files
+ .idea/
+ .vscode/
+
+ # Jupyter Notebook files
+ .ipynb_checkpoints
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Test and coverage reports
+ .coverage
+ *.coveragerc
+ nosetests.xml
+ coverage.xml
+ *.coveralls.yml
+
+ # MyPy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pytest
+ .cache/
+
+ # Sphinx documentation
+ docs/_build/
+
+ # pytest and flake8
+ *.log
+
+ # VS Code settings
+ .vscode/
+
+ # Django secrets
+ *.env
+
+ # Flask instance folder
+ instance/
+
+ # PyCharm project files
+ .idea/
+
+ # Other Python-related files
+ *.bak
+ *.swp
+ *.swo
+ ddet_classification/
+ .DS_Store
+ .pkl
ai_api/__init__.py ADDED
File without changes
ai_api/admin.py ADDED
@@ -0,0 +1,10 @@
+ from django.contrib import admin
+ from .models import APIClient
+
+ # admin.site.register(APIClient)
+
+ @admin.register(APIClient)
+ class APIClientAdmin(admin.ModelAdmin):
+     list_display = ('name', 'client_id', 'created_at')
+     readonly_fields = ('client_id', 'secret_key', 'created_at')
+     fields = ('name', 'client_id', 'secret_key', 'created_at') # show in form
ai_api/api.py ADDED
@@ -0,0 +1,44 @@
+ from django.shortcuts import render
+ from django.http import JsonResponse
+ from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm, TranscribeForm, YouTubeURLForm
+ import shutil
+ from django.conf import settings
+ import torch
+ import json
+ import os
+ from PIL import Image as PILImage
+ import io
+ import tempfile
+ from django.core.cache import cache
+ import numpy as numpy_lib
+ import pickle
+ from deepface import DeepFace
+ import cv2
+ import base64
+ from io import BytesIO
+ from . import globals
+ import tempfile
+ import mimetypes
+ import subprocess
+ import logging
+ import uuid
+ import yt_dlp
+ import time
+ import re
+ from pydub import AudioSegment
+ import pandas as pd
+ import csv
+ from .models import APIClient
+
+ API_VERSION = '1.0.0'
+
+ def index(request):
+     return JsonResponse({'message': 'Welcome to the BERNAMA Fact Check API', 'version': API_VERSION})
+
+ def clients(request):
+     # if not hasattr(request, 'api_client'):
+     #     return JsonResponse({'error': 'Unauthorized'}, status=401)
+
+     clients = list(APIClient.objects.values('name', 'client_id', 'created_at'))
+     return JsonResponse({'clients': clients})
+
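The two function-based endpoints above return plain JSON, so they can be smoke-tested without authentication (the auth check in clients() is commented out in this commit). A minimal sketch, assuming the Django dev server is running on localhost:8000 and these routes are mounted under an /api/ prefix — the actual prefix comes from the project urls.py, which appears later in the file list:

import requests

BASE = "http://localhost:8000/api"  # assumed mount point; adjust to match devlab_next/urls.py

# Index/ping endpoint: returns the welcome message and API version
r = requests.get(f"{BASE}/ping/")
print(r.json())  # e.g. {'message': 'Welcome to the BERNAMA Fact Check API', 'version': '1.0.0'}

# Clients endpoint: lists registered APIClient rows
r = requests.get(f"{BASE}/clients/")
print(r.json())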
ai_api/api_urls.py ADDED
@@ -0,0 +1,10 @@
+ from django.urls import path
+ from . import api, controllers
+
+ urlpatterns = [
+     path('', api.index, name='index'),
+     path('ping/', api.index, name='index'),
+     path('clients/', api.clients, name='clients'),
+     path('transcription/', controllers.transcription.TranscriptionAPIView.as_view(), name='transcription'),
+     path('classification/', controllers.classification.ClassificationAPIView.as_view(), name='classification'),
+ ]
ai_api/apps.py ADDED
@@ -0,0 +1,63 @@
+ from django.apps import AppConfig
+
+ class AiApiConfig(AppConfig):
+     default_auto_field = 'django.db.models.BigAutoField'
+     name = 'ai_api'
+
+     def ready(self):
+         from . import globals
+         from deepface import DeepFace
+         from ai_api.library.devlab_image import DevLabImage
+         from transformers import AutoTokenizer, AutoModelForSequenceClassification
+         import whisper
+         import os
+         from safetensors import safe_open
+         import torch
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         globals.devlab_image = DevLabImage()
+
+         # Load HuggingFace tokenizer and model once
+         save_path = os.path.join(os.path.dirname(__file__), "ddet_classification")
+         print(f"Model path: {save_path}")
+         globals.save_path = save_path
+
+         # Load tokenizer
+         try:
+             globals.tokenizer = AutoTokenizer.from_pretrained(save_path, device=device)
+             print("Tokenizer loaded ✅")
+         except Exception as e:
+             print(f"Failed to load tokenizer: {e}")
+             globals.tokenizer = None
+
+         # Check .safetensors before loading model
+         try:
+             safetensor_file = os.path.join(save_path, "model.safetensors")
+             if os.path.exists(safetensor_file):
+                 with safe_open(safetensor_file, framework="pt") as f:
+                     print("Safetensors file checked ✅")
+
+             globals.model = AutoModelForSequenceClassification.from_pretrained(save_path)
+             globals.model.eval()
+             print("Classification model loaded ✅")
+
+         except Exception as e:
+             print(f"Failed to load classification model: {e}")
+             globals.model = None
+
+         # Load Whisper model
+         try:
+             globals.whisper_model = whisper.load_model("large", device=device)
+             print("Whisper model loaded ✅")
+         except Exception as e:
+             print(f"Failed to load Whisper model: {e}")
+             globals.whisper_model = None
+
+         # Load FaceNet model
+         try:
+             globals.facenet_model = DeepFace.build_model("Facenet")
+             print("Facenet model loaded ✅")
+         except Exception as e:
+             print(f"Failed to load FaceNet model: {e}")
+             globals.facenet_model = None
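AiApiConfig.ready() eagerly loads every model into the module-level globals so request handlers never pay the load cost, and it sets each global to None when a load fails. A minimal sketch of how a view might consume them, assuming ready() has already run; transcribe() returning a dict with a "text" key is the standard openai-whisper API:

from ai_api import globals

def transcribe_file(path):
    # Guard against startup load failures: ready() leaves the global as None on error
    if globals.whisper_model is None:
        raise RuntimeError("Whisper model is not available")
    result = globals.whisper_model.transcribe(path)
    return result["text"]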
ai_api/controllers/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from . import transcription
+ from . import classification
ai_api/controllers/classification.py ADDED
@@ -0,0 +1,15 @@
+ # classification.py
+ from rest_framework.views import APIView
+ from rest_framework.response import Response
+ from rest_framework import status
+ from ..request_serializer import ClassificationRequestSerializer
+
+ class ClassificationAPIView(APIView):
+     def get(self, request):
+         return Response({"message": "Classification API"})
+
+     def post(self, request):
+         serializer = ClassificationRequestSerializer(data=request.data)
+         if serializer.is_valid():
+             return Response({"message": "Classification API"})
+         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
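ClassificationAPIView currently echoes a fixed message once the serializer validates, so a round-trip test only confirms the wiring. A hedged sketch, assuming the same /api/ mount point and that ClassificationRequestSerializer expects a claim field — the serializer itself (ai_api/request_serializer.py) is listed in this commit but not shown at this point, so the field name is an assumption:

import requests

resp = requests.post(
    "http://localhost:8000/api/classification/",   # assumed prefix
    data={"claim": "Contoh dakwaan untuk diuji"},  # assumed field name
)
print(resp.status_code, resp.json())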
ai_api/controllers/transcription.py ADDED
@@ -0,0 +1,16 @@
+ # transcription.py
+ from rest_framework.views import APIView
+ from rest_framework.response import Response
+ from rest_framework import status
+ from ..request_serializer import TranscriptionRequestSerializer
+
+ class TranscriptionAPIView(APIView):
+     def get(self, request):
+         return Response({"message": "Transcription API"})
+
+     def post(self, request):
+         serializer = TranscriptionRequestSerializer(data=request.data)
+         if serializer.is_valid():
+             media_file = request.FILES.get('media')
+             return Response({"media_file": media_file.name})
+         return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
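TranscriptionAPIView reads the upload from request.FILES['media'] and echoes the filename back. A minimal sketch of the matching multipart request, again assuming the /api/ prefix:

import requests

with open("sample.mp3", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/transcription/",  # assumed prefix
        files={"media": f},  # field name matches request.FILES.get('media') in the view
    )
print(resp.json())  # {'media_file': 'sample.mp3'} on success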
ai_api/forms.py ADDED
@@ -0,0 +1,86 @@
+ from django import forms
+ from .widgets import MultipleFileInput
+ from django.core.exceptions import ValidationError
+
+
+ class ImageUploadForm(forms.Form):
+     image = forms.ImageField(
+         widget=forms.ClearableFileInput(attrs={
+             'class': 'form-control',
+             'capture': 'user'
+         })
+     )
+
+ class ClassificationForm(forms.Form):
+     claim = forms.CharField(
+         label="Claim:",
+         widget=forms.Textarea(attrs={
+             'class': 'form-control',
+             'rows': 5,
+             'placeholder': 'Enter your claim or statement',
+         })
+     )
+
+ class RegisterFaceForm(forms.Form):
+     person = forms.CharField(
+         label="Person:",
+         widget=forms.TextInput(attrs={
+             'class': 'form-control',
+             'placeholder': 'e.g: ANWAR IBRAHIM',
+         })
+     )
+     keywords = forms.CharField(
+         label="Keyword:",
+         required=False,
+         widget=forms.TextInput(attrs={
+             'class': 'form-control',
+             'placeholder': 'e.g: Prime Minister of Malaysia',
+         })
+     )
+     images = forms.FileField(
+         required=False,
+         widget=MultipleFileInput(attrs={
+             'multiple': True,
+             'class': 'form-control',
+             'capture': 'user'
+         })
+     )
+
+ class TranscribeForm(forms.Form):
+     url = forms.CharField(
+         label="YouTube URL:",
+         required=False,
+         widget=forms.TextInput(attrs={
+             'type': 'url',
+             'class': 'form-control',
+             'placeholder': 'Enter YouTube URL',
+
+         })
+     )
+     file = forms.FileField(
+         label="Upload Audio/Video File",
+         required=False,
+         widget=forms.ClearableFileInput(attrs={
+             'class': 'form-control',
+             'accept': 'audio/*,video/*',
+
+         })
+     )
+     def clean(self):
+         cleaned_data = super().clean()
+         url = cleaned_data.get("url")
+         file = cleaned_data.get("file")
+
+         if not url and not file:
+             raise ValidationError("You must provide either a YouTube URL or upload a file.")
+         if url and file:
+             raise ValidationError("Please provide only one: YouTube URL or a file upload.")
+
+ class YouTubeURLForm(forms.Form):
+     youtube_url = forms.URLField(
+         label='YouTube Video URL',
+         widget=forms.URLInput(attrs={
+             'class': 'form-control',
+             'placeholder': 'https://www.youtube.com/watch?v=example'
+         })
+     )
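TranscribeForm.clean() enforces an exclusive-or between the URL and the file upload. A quick way to see the rule fire, e.g. from python manage.py shell:

from ai_api.forms import TranscribeForm

# Neither field supplied: clean() raises, surfaced via form.errors
form = TranscribeForm(data={"url": ""})
print(form.is_valid())  # False
print(form.errors)      # {'__all__': ['You must provide either a YouTube URL or upload a file.']}

# Exactly one field supplied: passes validation
form = TranscribeForm(data={"url": "https://www.youtube.com/watch?v=example"})
print(form.is_valid())  # True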
ai_api/globals.py ADDED
@@ -0,0 +1,6 @@
+ devlab_image = None
+ tokenizer = None
+ model = None
+ save_path = None
+ whisper_model = None
+ facenet_model = None
ai_api/library/apify_scraper.py ADDED
@@ -0,0 +1,893 @@
+ # apify_scraper.py
+ # Updated version: Uses separate Apify tokens for Facebook and TikTok tasks
+
+ import requests
+ import time
+ import pandas as pd
+ import os
+ import json
+ import hashlib
+ from datetime import datetime, timedelta
+
+ # Create cache directory
+ CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ # Import configuration settings
+ try:
+     from .config import (
+         # API tokens
+         APIFY_TOKEN, APIFY_TOKEN_FB, APIFY_TOKEN_TIKTOK,
+         # Task IDs
+         POST_TASK_ID_SEARCH, COMMENT_TASK_ID, TIKTOK_VIDEO_TASK_ID, TIKTOK_COMMENT_TASK_ID,
+         # Data source settings
+         USE_FACEBOOK, USE_TIKTOK, USE_SERPAPI, USE_SERPER, USE_DUCKDUCKGO, USE_LOWYAT,
+         # Comment settings
+         USE_COMMENTS,
+         # Result limits
+         FACEBOOK_MAX_RESULTS, TIKTOK_MAX_RESULTS, WEB_SEARCH_MAX_RESULTS, LOWYAT_MAX_THREADS,
+         # Lowyat Forum settings
+         LOWYAT_SECTIONS
+     )
+     # Use settings from config
+     print("[✓] Using configuration from config.py")
+ except ImportError:
+     # Fallback to hardcoded settings
+     print("[⚠️] Config not found, using hardcoded settings")
+     # API tokens
+     APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB"
+     #APIFY_TOKEN_FB = APIFY_TOKEN
+     #APIFY_TOKEN_TIKTOK = APIFY_TOKEN
+
+     # Actor task IDs
+     #POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)
+     #COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)
+     TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)
+     TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)
+
+
+
+     # Data source settings
+     USE_FACEBOOK = True
+     USE_TIKTOK = True
+     USE_SERPAPI = True
+     USE_SERPER = True
+     USE_DUCKDUCKGO = True
+     USE_LOWYAT = True
+
+     # Comment settings
+     USE_COMMENTS = True
+
+     # Result limits
+     FACEBOOK_MAX_RESULTS = 100
+     TIKTOK_MAX_RESULTS = 50
+     WEB_SEARCH_MAX_RESULTS = 20
+     LOWYAT_MAX_THREADS = 20
+
+     # Lowyat Forum settings
+     LOWYAT_SECTIONS = ["Kopitiam", "SeriousKopitiam", "Finance"]
+
+ def run(keywords, output_path="output/claim_data.csv", fetch_comments=True, max_videos=30, max_comments=50, max_results=None):
+     """Run data collection from multiple sources and combine results
+
+     Args:
+         keywords (list): List of keywords to search for
+         output_path (str): Path to save combined results
+         fetch_comments (bool): Whether to fetch comments for TikTok videos
+         max_videos (int): Maximum number of TikTok videos to fetch per keyword
+         max_comments (int): Maximum number of comments to fetch per TikTok video
+         max_results (int): Maximum results per source (overrides config settings)
+
+     Returns:
+         pandas.DataFrame: Combined results from all sources
+     """
+     all_records = []
+
+     # Use config settings if max_results not specified
+     fb_max = max_results or FACEBOOK_MAX_RESULTS
+     tiktok_max = max_results or TIKTOK_MAX_RESULTS
+     web_max = max_results or WEB_SEARCH_MAX_RESULTS
+
+     # Create output directory if it doesn't exist
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+     # os.makedirs(output_path, exist_ok=True)
+
+     # Create a summary of data sources
+     sources_enabled = []
+     if USE_FACEBOOK: sources_enabled.append("Facebook")
+     if USE_TIKTOK: sources_enabled.append("TikTok")
+     if USE_SERPAPI: sources_enabled.append("SerpApi")
+     if USE_SERPER: sources_enabled.append("Serper.dev")
+     if USE_DUCKDUCKGO: sources_enabled.append("DuckDuckGo")
+     if USE_LOWYAT: sources_enabled.append("Lowyat Forum")
+
+     print(f"[📊] Data collection enabled for: {', '.join(sources_enabled)}")
+     print(f"[🔍] Original Keywords: {', '.join(keywords)}")
+
+     # Optimize keywords for different platforms
+     try:
+         from tiktok_keyword_formatter import optimize_keywords_for_platforms
+         optimized_keywords = optimize_keywords_for_platforms(keywords)
+         tiktok_keywords = optimized_keywords["tiktok"]
+         web_keywords = optimized_keywords["web_search"]
+
+         print(f"[🔍] TikTok Keywords: {', '.join(tiktok_keywords)}")
+         print(f"[🔍] Web Search Keywords: {', '.join(web_keywords)}")
+     except ImportError:
+         print("[⚠️] Keyword formatter not found. Using original keywords for all platforms.")
+         tiktok_keywords = keywords
+         web_keywords = keywords
+
+     # Facebook post search
+     if USE_FACEBOOK:
+         try:
+             boolean_query = build_boolean_search(keywords)
+             print(f"[📘] Facebook: {boolean_query}")
+             post_input = {"search": boolean_query, "resultsPerPage": min(fb_max, 100)}
+
+             post_dataset_id = run_actor_task(POST_TASK_ID_SEARCH, post_input, platform="facebook")
+             posts = download_dataset(post_dataset_id, platform="facebook")
+             print(f"[📘] Retrieved {len(posts)} Facebook posts")
+
+             fb_records = []
+             for post in posts:
+                 # Check if this is Malaysian content
+                 username = post.get("username", "")
+                 text = post.get("text", "")
+                 post_url = post.get("url")
+
+                 if is_malaysian_content(username, text):
+                     # Add the post itself
+                     post_record = {
+                         "platform": "facebook",
+                         "date": post.get("createdAt"),
+                         "username": username,
+                         "post_text": text,
+                         "post_url": post_url,
+                         "likes": post.get("likes", 0),
+                         "shares": post.get("shares", 0),
+                         "comments_count": post.get("commentsCount", 0),
+                         "comment_text": "",
+                         "combined_text": text
+                     }
+                     fb_records.append(post_record)
+
+                     # If comments are enabled and the post has comments, scrape them
+                     if USE_COMMENTS and post.get("commentsCount", 0) > 0 and post_url:
+                         try:
+                             print(f"[💬] Scraping comments for Facebook post: {post_url}")
+                             comment_input = {"url": post_url, "maxComments": 50}
+                             comment_dataset_id = run_actor_task(COMMENT_TASK_ID, comment_input, platform="facebook")
+                             comments = download_dataset(comment_dataset_id, platform="facebook")
+                             print(f"[💬] Retrieved {len(comments)} comments for post")
+
+                             for comment in comments:
+                                 comment_text = comment.get("text", "")
+                                 comment_username = comment.get("name", "")
+
+                                 if is_malaysian_content(comment_username, comment_text):
+                                     comment_record = {
+                                         "platform": "facebook_comment",
+                                         "date": comment.get("date"),
+                                         "username": comment_username,
+                                         "post_text": "",
+                                         "post_url": post_url,
+                                         "likes": comment.get("likes", 0),
+                                         "shares": 0,
+                                         "comments_count": 0,
+                                         "comment_text": comment_text,
+                                         "combined_text": comment_text
+                                     }
+                                     fb_records.append(comment_record)
+                         except Exception as e:
+                             print(f"[❌] Error scraping comments for post {post_url}: {str(e)}")
+                             print("[⚠️] Continuing with next post...")
+
+             print(f"[📊] Added {len(fb_records)} Facebook records after filtering")
+             all_records.extend(fb_records)
+         except Exception as e:
+             print(f"[❌] Error during Facebook scraping: {str(e)}")
+             print("[⚠️] Continuing with other data sources...")
+
+     # TikTok scraping
+     if USE_TIKTOK:
+         try:
+             print(f"[📽️] TikTok: Searching for {', '.join(tiktok_keywords)}")
+             tiktok_records = []
+
+             # Use only the top 3 most relevant keywords as requested
+             top_keywords = tiktok_keywords[:min(3, len(tiktok_keywords))]
+             print(f"[📽️] Using top {len(top_keywords)} TikTok keywords: {', '.join(top_keywords)}")
+
+             # Set video limits as requested by user
+             videos_per_keyword = max_videos # Use the parameter value
+
+             # No total video limit - collect exactly max_videos per keyword
+             total_videos_collected = 0
+             max_total_videos = max_videos * len(top_keywords) # Allow max_videos per keyword
+
+             # for keyword in top_keywords:
+             try:
+                 # Print detailed debugging information
+                 print(f"[📽️] DEBUG: TikTok API Token: {APIFY_TOKEN_TIKTOK[:5]}...{APIFY_TOKEN_TIKTOK[-5:]}")
+                 print(f"[📽️] DEBUG: TikTok Video Task ID: {TIKTOK_VIDEO_TASK_ID}")
+                 print(f"[📽️] DEBUG: TikTok Comment Task ID: {TIKTOK_COMMENT_TASK_ID}")
+
+                 keyword = ', '.join(tiktok_keywords)
+
+                 # Limit videos per keyword to save costs
+                 tiktok_input = { "searchQueries": [keyword], "maxVideos": videos_per_keyword}
+                 # tiktok_input ={"searchQueries": keyword}
+                 print(f"[📽️] Requesting {videos_per_keyword} TikTok videos for: {keyword}")
+                 print(f"[📽️] DEBUG: Full input payload: {tiktok_input}")
+
+
+                 try:
+                     tiktok_dataset_id = run_actor_task(TIKTOK_VIDEO_TASK_ID, tiktok_input, platform="tiktok")
+                     print(f"[📽️] DEBUG: Successfully got dataset ID: {tiktok_dataset_id}")
+                     videos = download_dataset(tiktok_dataset_id, platform="tiktok")
+                     print(f"[📽️] Retrieved {len(videos)} TikTok videos for: {keyword}")
+                 except Exception as e:
+                     print(f"[❌] DETAILED ERROR in TikTok video extraction: {str(e)}")
+                     print(f"[❌] Error type: {type(e).__name__}")
+                     import traceback
+                     print(f"[❌] Traceback: {traceback.format_exc()}")
+                     videos = []
+
+                 for video in videos:
+                     # Check if we've reached the maximum total videos limit
+                     if total_videos_collected >= max_total_videos:
+                         print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping collection.")
+                         break
+
+                     username = video.get("authorMeta", {}).get("userName", "") or video.get("authorMeta", {}).get("name", "")
+                     caption = video.get("text", "")
+
+                     if is_malaysian_content(username, caption):
+                         # Increment the total videos counter
+                         total_videos_collected += 1
+                         video_url = video.get("webVideoUrl") or video.get("videoUrl")
+                         clean_url = video_url.split("?")[0] if video_url and "/video/" in video_url else None
+
+                         video_record = {
+                             "platform": "tiktok",
+                             "date": video.get("createTimeISO") or video.get("createTime"),
+                             "username": username,
+                             "post_text": caption,
+                             "post_url": clean_url,
+                             "likes": video.get("diggCount", 0),
+                             "shares": video.get("shareCount", 0),
+                             "comments_count": video.get("commentCount", 0),
+                             "comment_text": "",
+                             "combined_text": caption
+                         }
+
+                         tiktok_records.append(video_record)
+
+                         # If comments are enabled and the video has comments, scrape them
+                         # Get comments per video as requested by the user
+                         min_comments_threshold = 5 # Lower threshold to ensure we get comments
+                         max_comments_to_scrape = max_comments # Use the parameter value
+                         max_videos_with_comments = 10 # Allow more videos with comments
+
+                         # Track how many videos we've scraped comments for
+                         if not hasattr(run, 'videos_with_comments_count'):
+                             run.videos_with_comments_count = 0
+
+                         if (fetch_comments and
+                             run.videos_with_comments_count < max_videos_with_comments and
+                             video.get("commentCount", 0) >= min_comments_threshold and
+                             clean_url and
+                             video.get("diggCount", 0) > 10): # Very low threshold to ensure we get comments for most videos
+                             try:
+                                 print(f"[💬] Scraping comments for popular TikTok video ({run.videos_with_comments_count+1}/{max_videos_with_comments}): {clean_url}")
+                                 comment_input = {"postURLs": [clean_url], "commentsPerPost": max_comments_to_scrape}
+                                 print(f"[💬] DEBUG: Comment input payload: {comment_input}")
+
+                                 try:
+                                     comment_dataset_id = run_actor_task(TIKTOK_COMMENT_TASK_ID, comment_input, platform="tiktok")
+                                     print(f"[💬] DEBUG: Successfully got comment dataset ID: {comment_dataset_id}")
+                                     comments = download_dataset(comment_dataset_id, platform="tiktok")
+                                     run.videos_with_comments_count += 1
+                                     print(f"[💬] Retrieved {len(comments)} comments for video")
+                                 except Exception as e:
+                                     print(f"[❌] DETAILED ERROR in TikTok comment extraction: {str(e)}")
+                                     print(f"[❌] Error type: {type(e).__name__}")
+                                     import traceback
+                                     print(f"[❌] Traceback: {traceback.format_exc()}")
+                                     comments = []
+
+                                 for comment in comments:
+                                     comment_text = comment.get("text", "")
+                                     comment_username = comment.get("author", {}).get("uniqueId", "") or comment.get("author", {}).get("nickname", "")
+
+                                     if is_malaysian_content(comment_username, comment_text):
+                                         comment_record = {
+                                             "platform": "tiktok_comment",
+                                             "date": comment.get("createTime"),
+                                             "username": comment_username,
+                                             "post_text": "",
+                                             "post_url": clean_url,
+                                             "likes": comment.get("diggCount", 0),
+                                             "shares": 0,
+                                             "comments_count": 0,
+                                             "comment_text": comment_text,
+                                             "combined_text": comment_text
+                                         }
+                                         tiktok_records.append(comment_record)
+                             except Exception as e:
+                                 print(f"[❌] Error scraping comments for video {clean_url}: {str(e)}")
+                                 print("[⚠️] Continuing with next video...")
+                     # Check if we've reached the maximum total videos limit after processing this keyword
+                     if total_videos_collected >= max_total_videos:
+                         print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping keyword search.")
+                         break
+             except Exception as e:
+                 print(f"[❌] Error processing TikTok keyword '{keyword}': {str(e)}")
+                 print("[⚠️] Continuing with next keyword...")
+
+             print(f"[📊] Added {len(tiktok_records)} TikTok records after filtering")
+             all_records.extend(tiktok_records)
+         except Exception as e:
+             print(f"[❌] Error during TikTok scraping: {str(e)}")
+             print("[⚠️] Continuing with other data sources...")
+
+     # Web search (SerpApi, Serper.dev, DuckDuckGo)
+     if USE_SERPAPI or USE_SERPER or USE_DUCKDUCKGO:
+         try:
+             print(f"[🌐] Web Search: Searching for {', '.join(web_keywords)}")
+             web_search_output = f"output/{os.path.basename(output_path).split('.')[0]}_web.csv"
+
+             # Try to import the run_web_search function
+             try:
+                 from run_web_search import run_web_search
+
+                 # Get the full claim from the environment if available
+                 full_claim = os.environ.get("FULL_CLAIM", None)
+                 if full_claim:
+                     print(f"[🔍] Using full claim for web search: {full_claim}")
+
+                 # Pass configuration settings to run_web_search
+                 web_results_count = run_web_search(
+                     web_keywords,
+                     web_search_output,
+                     num_results=web_max,
+                     use_serpapi=USE_SERPAPI,
+                     use_serper=USE_SERPER,
+                     use_duckduckgo=USE_DUCKDUCKGO,
+                     full_claim=full_claim
+                 )
+                 print(f"[🌐] Retrieved {web_results_count} web search results")
+
+                 # If web search was successful, read the results and add to all_records
+                 if web_results_count > 0:
+                     try:
+                         web_df = pd.read_csv(web_search_output)
+                         web_records = web_df.to_dict('records')
+                         all_records.extend(web_records)
+                         print(f"[📊] Added {len(web_records)} web search records")
+                     except Exception as e:
+                         print(f"[❌] Error reading web search results: {str(e)}")
+             except ImportError:
+                 print("[⚠️] Web search module not found. Skipping web search.")
+         except Exception as e:
+             print(f"[❌] Error during web search: {str(e)}")
+
+     # Lowyat Forum data collection
+     if USE_LOWYAT:
+         try:
+             print(f"[📚] Collecting data from Lowyat Forum...")
+
+             # Import the Lowyat Forum crawler
+             try:
+                 from lowyat_crawler import run_lowyat_crawler
+
+                 # Use the same keywords for Lowyat Forum
+                 lowyat_keywords = keywords
+
+                 # Check for environment variable override for sections
+                 sections_to_use = LOWYAT_SECTIONS
+                 if os.environ.get("LOWYAT_SECTIONS"):
+                     sections_to_use = os.environ.get("LOWYAT_SECTIONS").split(",")
+                     print(f"[📚] Using Lowyat Forum sections from environment: {', '.join(sections_to_use)}")
+
+                 # Get the full claim from the environment if available
+                 full_claim = os.environ.get("FULL_CLAIM", None)
+                 if full_claim:
+                     print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
+
+                 # Get Lowyat Forum data
+                 lowyat_output_path = output_path.replace(".csv", "_lowyat.csv")
+                 try:
+                     lowyat_df = run_lowyat_crawler(
+                         lowyat_keywords,
+                         sections=sections_to_use,
+                         max_threads=LOWYAT_MAX_THREADS,
+                         output_path=lowyat_output_path,
+                         full_claim=full_claim
+                     )
+
+                     # Convert DataFrame to records and add to all_records
+                     if not lowyat_df.empty:
+                         lowyat_records = lowyat_df.to_dict('records')
+                         all_records.extend(lowyat_records)
+                         print(f"[📚] Added {len(lowyat_records)} Lowyat Forum records")
+                     else:
+                         print(f"[⚠️] No Lowyat Forum data found for keywords: {', '.join(lowyat_keywords)}")
+
+                         # Generate sample data for testing if needed
+                         if os.environ.get("GENERATE_SAMPLE_LOWYAT_DATA", "false").lower() == "true":
+                             print("[📚] Generating sample Lowyat Forum data for testing...")
+
+                             # Create a sample dataframe with the claim
+                             from datetime import datetime
+                             current_date = datetime.now().strftime('%Y-%m-%d')
+
+                             # Get the claim text or keywords
+                             claim_text = full_claim if full_claim else ', '.join(lowyat_keywords)
+
+                             # Create relevant sample data based on claim content
+                             sample_data = []
+
+                             # Check for different types of claims and create relevant sample data
+                             if any(term in claim_text.lower() for term in ['hon', 'tenonet', 'kenderaan', 'kereta']):
+                                 # Horn/vehicle related claim
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'CarEnthusiast',
+                                     'post_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini.",
+                                     'post_url': 'https://forum.lowyat.net/topic/hon-tenonet',
+                                     'likes': 15,
+                                     'shares': 3,
+                                     'comments_count': 8,
+                                     'comment_text': '',
+                                     'combined_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini."
+                                 })
+
+                                 sample_data.append({
+                                     'platform': 'LowyatForum_Comment',
+                                     'date': current_date,
+                                     'username': 'LegalExpert',
+                                     'post_text': '',
+                                     'post_url': 'https://forum.lowyat.net/topic/hon-tenonet#comment1',
+                                     'likes': 7,
+                                     'shares': 0,
+                                     'comments_count': 0,
+                                     'comment_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000.",
+                                     'combined_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000."
+                                 })
+
+                             elif any(term in claim_text.lower() for term in ['kelantan', 'rogol', 'sumbang mahram', 'jenayah']):
+                                 # Crime in Kelantan related claim
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'SocialObserver',
+                                     'post_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini.",
+                                     'post_url': 'https://forum.lowyat.net/topic/crime-statistics',
+                                     'likes': 12,
+                                     'shares': 5,
+                                     'comments_count': 7,
+                                     'comment_text': '',
+                                     'combined_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini."
+                                 })
+
+                                 sample_data.append({
+                                     'platform': 'LowyatForum_Comment',
+                                     'date': current_date,
+                                     'username': 'CommunityLeader',
+                                     'post_text': '',
+                                     'post_url': 'https://forum.lowyat.net/topic/crime-statistics#comment1',
+                                     'likes': 8,
+                                     'shares': 0,
+                                     'comments_count': 0,
+                                     'comment_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah.",
+                                     'combined_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah."
+                                 })
+
+                             elif any(term in claim_text.lower() for term in ['kelongsong', 'peluru', 'senjata', 'tan']):
+                                 # Ammunition related claim
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'SecurityAnalyst',
+                                     'post_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?",
+                                     'post_url': 'https://forum.lowyat.net/topic/security-threat',
+                                     'likes': 25,
+                                     'shares': 10,
+                                     'comments_count': 15,
+                                     'comment_text': '',
+                                     'combined_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?"
+                                 })
+
+                                 sample_data.append({
+                                     'platform': 'LowyatForum_Comment',
+                                     'date': current_date,
+                                     'username': 'DefenseExpert',
+                                     'post_text': '',
+                                     'post_url': 'https://forum.lowyat.net/topic/security-threat#comment1',
+                                     'likes': 18,
+                                     'shares': 0,
+                                     'comments_count': 0,
+                                     'comment_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah.",
+                                     'combined_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah."
+                                 })
+
+                             elif any(term in claim_text.lower() for term in ['minyak sawit', 'cukai', 'ekonomi']):
+                                 # Palm oil tax related claim
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'EconomyWatcher',
+                                     'post_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara.",
+                                     'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax',
+                                     'likes': 20,
+                                     'shares': 8,
+                                     'comments_count': 12,
+                                     'comment_text': '',
+                                     'combined_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara."
+                                 })
+
+                                 sample_data.append({
+                                     'platform': 'LowyatForum_Comment',
+                                     'date': current_date,
+                                     'username': 'IndustryInsider',
+                                     'post_text': '',
+                                     'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax#comment1',
+                                     'likes': 15,
+                                     'shares': 0,
+                                     'comments_count': 0,
+                                     'comment_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak.",
+                                     'combined_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak."
+                                 })
+
+                             else:
+                                 # Default generic sample data if no specific claim type is detected
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'LowyatUser123',
+                                     'post_text': f"Discussing: {claim_text}",
+                                     'post_url': 'https://forum.lowyat.net/topic/sample',
+                                     'likes': 5,
+                                     'shares': 0,
+                                     'comments_count': 2,
+                                     'comment_text': '',
+                                     'combined_text': f"Discussing: {claim_text}"
+                                 })
+
+                                 sample_data.append({
+                                     'platform': 'LowyatForum_Comment',
+                                     'date': current_date,
+                                     'username': 'LowyatCommenter',
+                                     'post_text': '',
+                                     'post_url': 'https://forum.lowyat.net/topic/sample#comment1',
+                                     'likes': 2,
+                                     'shares': 0,
+                                     'comments_count': 0,
+                                     'comment_text': f"Commenting on: {claim_text}",
+                                     'combined_text': f"Commenting on: {claim_text}"
+                                 })
+
+                             # If no sample data was created (unlikely), create a default one
+                             if not sample_data:
+                                 sample_data.append({
+                                     'platform': 'LowyatForum',
+                                     'date': current_date,
+                                     'username': 'LowyatUser123',
+                                     'post_text': f"Discussing: {claim_text}",
+                                     'post_url': 'https://forum.lowyat.net/topic/sample',
+                                     'likes': 5,
+                                     'shares': 0,
+                                     'comments_count': 2,
+                                     'comment_text': '',
+                                     'combined_text': f"Discussing: {claim_text}"
+                                 })
+
+                             sample_df = pd.DataFrame(sample_data)
+                             if lowyat_output_path:
+                                 sample_df.to_csv(lowyat_output_path, index=False)
+
+                             all_records.extend(sample_data)
+                             print(f"[📚] Added {len(sample_data)} sample Lowyat Forum records")
+                 except Exception as e:
+                     print(f"[⚠️] Error during Lowyat Forum crawling: {str(e)}")
+                     print("[⚠️] Continuing without Lowyat Forum data...")
+
+             except ImportError:
+                 print("[❌] Lowyat Forum crawler module not found. Skipping Lowyat Forum data collection.")
+
+         except Exception as e:
+             print(f"[❌] Error during Lowyat Forum data collection: {str(e)}")
+             print("[⚠️] Continuing with other data sources...")
+
+     # Save all records to CSV
+     if all_records:
+         df = pd.DataFrame(all_records)
+         df.to_csv(output_path, index=False)
+         print(f"[💾] Saved {len(df)} records to {output_path}")
+
+         # Print summary of data sources
+         source_counts = df['platform'].value_counts().to_dict()
+         print("\n[📊] Data collection summary:")
+         for source, count in source_counts.items():
+             # Use shorter display names for Lowyat Forum sources
+             display_source = source
+             if source == "LowyatForum":
+                 display_source = "LF"
+             elif source == "LowyatForum_Comment":
+                 display_source = "LF_Comment"
+             print(f" - {display_source}: {count} records")
+
+         return df
+     else:
+         # Create empty DataFrame and save to CSV
+         empty_df = pd.DataFrame(columns=["platform", "date", "username", "post_text", "post_url", "likes", "shares", "comments_count", "comment_text", "combined_text"])
+         empty_df.to_csv(output_path, index=False)
+         print(f"[⚠️] No records found. Saved empty DataFrame to {output_path}")
+         return empty_df
+
+ def run_actor_task(task_id, input_payload, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
+     # Generate a cache key based on task_id and input_payload
+     cache_key = f"{task_id}_{json.dumps(input_payload, sort_keys=True)}"
+     cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
+     cache_file = os.path.join(CACHE_DIR, f"{cache_hash}.json")
+
+     # Check if we have a valid cached result
+     if use_cache and os.path.exists(cache_file):
+         try:
+             with open(cache_file, 'r') as f:
+                 cache_data = json.load(f)
+
+             # Check if cache is still valid
+             cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
+             cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)
+
+             if datetime.now() < cache_expiry:
+                 print(f"[💾] Using cached result for task {task_id} (expires {cache_expiry.isoformat()})")
+                 return cache_data.get('dataset_id')
+             else:
+                 print(f"[⏰] Cache expired for task {task_id}, fetching fresh data")
+         except Exception as e:
+             print(f"[⚠️] Error reading cache: {str(e)}")
+
+     token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
+     headers = {
+         "Authorization": f"Bearer {token}",
+         "Content-Type": "application/json"
+     }
+     url = f"https://api.apify.com/v2/actor-tasks/{task_id}/runs"
+
+     # Try multiple times in case of network issues
+     for attempt in range(max_retries):
+         try:
+             print(f"[🔄] Attempt {attempt+1}/{max_retries} to run task {task_id}...")
+             print(input_payload)
+             # response = requests.post(url, json={"input": input_payload}, headers=headers, timeout=timeout)
+             response = requests.post(url, json=input_payload, headers=headers, timeout=timeout)
+
+             if response.status_code != 201:
+                 print(f"[❌] Failed to run task: {response.text}")
+                 if attempt < max_retries - 1:
+                     print("[⏳] Retrying...")
+                     time.sleep(5) # Wait 5 seconds before retrying
+                     continue
+                 raise Exception(f"Task run failed after {max_retries} attempts.")
+
+             run_id = response.json()["data"]["id"]
+             print(f"[🟢] Task {task_id} started: {run_id}")
+             status_url = f"https://api.apify.com/v2/actor-runs/{run_id}"
+             break # Success, exit the retry loop
+         except requests.exceptions.Timeout:
+             print(f"[❌] Request timed out after {timeout} seconds")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Task run timed out after {max_retries} attempts.")
+         except requests.exceptions.ConnectionError:
+             print(f"[❌] Connection error")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Connection error after {max_retries} attempts.")
+         except Exception as e:
+             print(f"[❌] Unexpected error: {str(e)}")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")
+     while True:
+         status_data = requests.get(status_url, headers=headers).json()
+         if status_data["data"]["status"] in ["SUCCEEDED", "FAILED"]:
+             break
+         print("[⏳] Waiting for task run to complete...")
+         time.sleep(5)
+
+     if status_data["data"]["status"] == "SUCCEEDED":
+         dataset_id = status_data["data"]["defaultDatasetId"]
+
+         # Save result to cache
+         if use_cache:
+             try:
+                 cache_data = {
+                     "dataset_id": dataset_id,
+                     "timestamp": datetime.now().isoformat(),
+                     "task_id": task_id,
+                     "platform": platform
+                 }
+
+                 with open(cache_file, 'w') as f:
+                     json.dump(cache_data, f)
+
+                 print(f"[💾] Saved result to cache: {cache_file}")
+             except Exception as e:
+                 print(f"[⚠️] Error saving to cache: {str(e)}")
+
+         return dataset_id
+     else:
+         raise Exception("Task run failed.")
+
+ def is_malaysian_content(username, text):
+     # Check if content is relevant to the claim
+     user_lower = (username or "").lower()
+     text_lower = (text or "").lower()
+
+     # Get the full claim from environment if available
+     full_claim = os.environ.get("FULL_CLAIM", "")
+     claim_lower = full_claim.lower()
+
+     # Check if this is about sexual crimes in Kelantan
+     kelantan_sexual_crime = "kelantan" in claim_lower and ("rogol" in claim_lower or "sumbang mahram" in claim_lower)
+
+     if kelantan_sexual_crime:
+         # For the specific claim about sexual crimes in Kelantan, use very targeted filtering
+         kelantan_keywords = ["kelantan", "kelantanese"]
+         crime_keywords = ["rogol", "sumbang mahram", "jenayah seksual", "kes", "polis", "pdrm"]
+
+         # Must have at least one Kelantan reference AND one crime reference to be relevant
+         has_kelantan_ref = any(k in text_lower for k in kelantan_keywords)
+         has_crime_ref = any(k in text_lower for k in crime_keywords)
+
+         if has_kelantan_ref and has_crime_ref:
+             return True
+
+         # Check if username is from a relevant authority
+         authority_users = ["polis", "pdrm", "kelantan", "bukit aman", "bernama", "berita"]
+         if any(k in user_lower for k in authority_users):
+             return True
+
+         # More restrictive for this specific claim - return False if not matching criteria
+         return False
+     else:
+         # General Malaysian content detection for other claims
+         # Keywords for crime-related content
+         crime_keywords = [
+             "polis", "kelantan", "jenayah", "rogol", "sumbang mahram", "inses",
+             "kes", "statistik", "bimbang", "pdrm", "malaysia", "undang-undang",
+             "mahkamah", "hukuman", "tangkap", "siasat", "lapor", "mangsa", "suspek",
+             "tertuduh", "penderaan", "seksual", "cabul", "gangguan"
+         ]
+
+         # Check if any crime keywords are in the text
+         if any(k in text_lower for k in crime_keywords):
+             return True
+
+         # Check if username looks Malaysian
+         malaysian_user_indicators = [
+             "my", "ms", "malaysia", "officialmy", "rakyat", "malay",
+             "dr", "dato", "yb", "ustaz", "cikgu", "polis", "kelantan"
+         ]
+
+         if any(k in user_lower for k in malaysian_user_indicators):
+             return True
+
+         # Default to True for now to maximize data collection, but with better filtering
+         return True
+
+
+
+ def download_dataset(dataset_id, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
+     # Check if we have a cached dataset
+     cache_file = os.path.join(CACHE_DIR, f"dataset_{dataset_id}.json")
+
+     if use_cache and os.path.exists(cache_file):
+         try:
+             with open(cache_file, 'r') as f:
+                 cache_data = json.load(f)
+
+             # Check if cache is still valid
+             cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
+             cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)
+
+             if datetime.now() < cache_expiry:
+                 print(f"[💾] Using cached dataset {dataset_id} (expires {cache_expiry.isoformat()})")
+                 return cache_data.get('data', [])
+             else:
+                 print(f"[⏰] Cache expired for dataset {dataset_id}, fetching fresh data")
+         except Exception as e:
+             print(f"[⚠️] Error reading dataset cache: {str(e)}")
+
+     token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
+     headers = {
+         "Authorization": f"Bearer {token}"
+     }
+     dataset_url = f"https://api.apify.com/v2/datasets/{dataset_id}/items?clean=true&format=json"
+
+     # Try multiple times in case of network issues
+     for attempt in range(max_retries):
+         try:
+             print(f"[🔄] Attempt {attempt+1}/{max_retries} to download dataset {dataset_id}...")
+             response = requests.get(dataset_url, headers=headers, timeout=timeout)
+
+             if response.status_code != 200:
+                 print(f"[❌] Failed to download dataset: {response.text}")
+                 if attempt < max_retries - 1:
+                     print("[⏳] Retrying...")
+                     time.sleep(5) # Wait 5 seconds before retrying
+                     continue
+                 raise Exception(f"Dataset download failed after {max_retries} attempts.")
+
+             data = response.json()
+             print(f"[✓] Downloaded {len(data)} items from dataset {dataset_id}")
+
+             # Save dataset to cache
+             if use_cache:
+                 try:
+                     cache_data = {
+                         "data": data,
+                         "timestamp": datetime.now().isoformat(),
+                         "dataset_id": dataset_id,
+                         "platform": platform
+                     }
+
+                     with open(cache_file, 'w') as f:
+                         json.dump(cache_data, f)
+
+                     print(f"[💾] Saved dataset to cache: {cache_file}")
+                 except Exception as e:
+                     print(f"[⚠️] Error saving dataset to cache: {str(e)}")
+
+             return data
+         except requests.exceptions.Timeout:
+             print(f"[❌] Request timed out after {timeout} seconds")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Dataset download timed out after {max_retries} attempts.")
+         except requests.exceptions.ConnectionError:
+             print(f"[❌] Connection error")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Connection error after {max_retries} attempts.")
+         except Exception as e:
+             print(f"[❌] Unexpected error: {str(e)}")
+             if attempt < max_retries - 1:
+                 print("[⏳] Retrying...")
+                 time.sleep(5) # Wait 5 seconds before retrying
+             else:
+                 raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")
+
+     # If we get here, all retries failed
+     return []
+
+ def build_boolean_search(keywords):
+     """Build an optimized search query for social media platforms"""
+     search_terms = []
+
+     for kw in keywords:
+         # If keyword contains spaces (multi-word phrase), wrap in quotes
+         if " " in kw:
+             search_terms.append(f'"{kw}"')
+         else:
+             # For single words, don't use quotes to get broader results
+             search_terms.append(kw)
+
+     return " OR ".join(search_terms)
+
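run() is the module's single entry point; run_actor_task(), download_dataset(), is_malaysian_content(), and build_boolean_search() are the plumbing it calls. A usage sketch, assuming valid Apify tokens are configured and noting that the optional helper modules (tiktok_keyword_formatter, run_web_search, lowyat_crawler) are skipped gracefully when not importable:

from ai_api.library.apify_scraper import run

df = run(
    ["hon tenonet", "JPJ"],               # keywords; multi-word phrases get quoted by build_boolean_search()
    output_path="output/claim_data.csv",
    fetch_comments=True,
    max_videos=10,                        # TikTok videos per keyword
    max_comments=20,                      # comments per video
)
print(df["platform"].value_counts())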
ai_api/library/config.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ config.py
3
+ Central configuration for the claim analysis system
4
+ """
5
+
6
+ import os
7
+
8
+ # Base directories
9
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
10
+ DATA_DIR = os.path.join(BASE_DIR, "data")
11
+ OUTPUT_DIR = os.path.join(BASE_DIR, "output")
12
+ REPORTS_DIR = os.path.join(BASE_DIR, "reports")
13
+
14
+ # Create directories if they don't exist
15
+ for directory in [DATA_DIR, OUTPUT_DIR, REPORTS_DIR]:
16
+ os.makedirs(directory, exist_ok=True)
17
+
18
+ # API Keys
19
+ GOOGLE_API_KEY = "AIzaSyAnXTkB_0HKXKul3eI-1A56ZQWyjTVj1cQ" # Google Custom Search API key
20
+ GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Add your search engine ID here (you'll need to create this)
21
+
22
+ # Serper.dev API Key (alternative search API)
23
+ SERPER_API_KEY = "e0af440fd71fb125dd38644fe378831c3ed741ca"
24
+
25
+ # SerpApi Google Search API Key
26
+ SERPAPI_API_KEY = "007928aeb7d86d4a85af12728e3534163961837027afb63ec7b89a4624a9f4ac"
27
+
28
+ # Data source settings
29
+ USE_FACEBOOK = False # Disable Facebook data collection
30
+ USE_TIKTOK = True # Enable TikTok data collection
31
+ USE_SERPAPI = True # Enable SerpApi web search
32
+ USE_SERPER = True # Enable Serper.dev web search
33
+ USE_DUCKDUCKGO = False # Disable DuckDuckGo web search
34
+ USE_LOWYAT = True # Enable Lowyat Forum data collection
35
+
36
+ # Number of results to collect from each source
37
+ FACEBOOK_MAX_RESULTS = 100
38
+ TIKTOK_MAX_RESULTS = 10 # Significantly reduced to save Apify costs
39
+ WEB_SEARCH_MAX_RESULTS = 20
40
+ LOWYAT_MAX_THREADS = 20 # Maximum number of Lowyat Forum threads to collect
41
+
42
+ # Lowyat Forum settings
43
+ LOWYAT_SECTIONS = [
44
+ "Kopitiam", "SeriousKopitiam", "News", "Politics", "Malaysia", "Lowyat.NET",
45
+ "Technology", "Computers", "Notebooks", "Smartphones", "Photography", "GamingPC", "GamingConsole",
46
+ "Automotive", "Finance", "Property", "Travel", "Food", "Health", "Sports", "Entertainment",
47
+ "SpecialInterestGarageSales", "JobsCorner", "DigitalMarketplace"
48
+ ] # All available forum sections
49
+
50
+ # Social Media API tokens
51
+ APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB" # Main Apify API token
52
+ APIFY_TOKEN_FB = APIFY_TOKEN # For Facebook actors
53
+ APIFY_TOKEN_TIKTOK = APIFY_TOKEN # For TikTok actors
54
+
55
+ # Actor task IDs
56
+ # From danek/facebook-search-ppr
57
+ POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)
58
+
59
+ # From datavoyantlab/facebook-comments-scraper
60
+ COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)
61
+
62
+ # From clockworks/free-tiktok-scraper
63
+ TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)
64
+
65
+ # From clockworks/tiktok-comments-scraper
66
+ TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)
67
+
68
+ # Apify settings
69
+ USE_COMMENTS = True # Whether to collect comments in addition to posts/videos
70
+
71
+ # Sentiment model
72
+ SENTIMENT_MODEL = "rmtariq/ft-Malay-bert"
73
+
74
+ # Priority indexer settings
75
+ PRIORITY_WEIGHTS = {
76
+ "fact_check_value": 1.5, # Higher weight for factual importance
77
+ "cause_confusion": 1.2, # Medium-high weight for confusion potential
78
+ "cause_chaos": 1.8, # High weight for potential harm
79
+ "affects_government": 1.3, # Medium-high for government impact
80
+ "economic_impact": 1.4, # Medium-high for economic impact
81
+ "law_related": 1.5, # Higher weight for legal implications
82
+ "public_interest": 1.2, # Medium weight for public interest
83
+ "lives_in_danger": 2.0, # Highest weight for safety concerns
84
+ "viral": 1.1, # Lower weight for virality alone
85
+ "urgent": 1.3 # Medium-high for urgency
86
+ }
87
+
88
+ PRIORITY_THRESHOLDS = {
89
+ "high_priority": 7.0,
90
+ "medium_priority": 5.0,
91
+ "low_priority": 3.0
92
+ }
93
+
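As a rough sketch of how these weights and thresholds could be combined (priority_indexer.py ships its own hard-coded weights, so treat this purely as an illustration; bucket_priority is not part of the codebase):

    def bucket_priority(flags):
        # Sum the configured weight of every active flag, then bucket by threshold.
        score = sum(PRIORITY_WEIGHTS[name] for name, active in flags.items()
                    if active and name in PRIORITY_WEIGHTS)
        if score >= PRIORITY_THRESHOLDS["high_priority"]:
            return "high_priority", score
        if score >= PRIORITY_THRESHOLDS["medium_priority"]:
            return "medium_priority", score
        return "low_priority", score

    # bucket_priority({"lives_in_danger": 1, "urgent": 1}) -> ("low_priority", 3.3)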
94
+ # Classification settings
95
+ VERDICT_CATEGORIES = {
96
+ "TIDAK_BENAR": {
97
+ "name": "TIDAK BENAR",
98
+ "description": "Dakwaan ini tidak benar berdasarkan bukti yang ada.",
99
+ "threshold": 7.0,
100
+ "conditions": ["fact_check_value", "law_related"]
101
+ },
102
+ "BERCAMPUR": {
103
+ "name": "BERCAMPUR",
104
+ "description": "Dakwaan ini mengandungi unsur-unsur benar dan tidak benar.",
105
+ "threshold": 5.0,
106
+ "conditions": ["cause_confusion"]
107
+ },
108
+ "BENAR": {
109
+ "name": "BENAR",
110
+ "description": "Dakwaan ini benar berdasarkan bukti yang ada.",
111
+ "threshold": 3.0,
112
+ "conditions": []
113
+ },
114
+ "TIDAK_PASTI": {
115
+ "name": "TIDAK PASTI",
116
+ "description": "Tidak cukup bukti untuk menentukan kebenaran dakwaan ini.",
117
+ "threshold": 0.0,
118
+ "conditions": []
119
+ }
120
+ }
121
+
122
+ # Database settings
123
+ DB_PATH = os.path.join(DATA_DIR, "claims.db")
124
+
125
+ # Malaysian filter settings
126
+ MALAYSIAN_FILTER_THRESHOLD = 0.5 # Confidence threshold for Malaysian content
127
+
128
+ # Report settings
129
+ REPORT_TEMPLATE = None # Path to DOCX template (optional)
130
+ GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Google Search Engine ID (duplicate of the value set in the API Keys section above)
131
+
ai_api/library/devlab_image.py ADDED
@@ -0,0 +1,487 @@
1
+ import os
2
+ from transformers import BlipProcessor, BlipForConditionalGeneration
3
+ from PIL import Image
4
+ from PIL.ExifTags import TAGS
5
+ import json
6
+ import subprocess
7
+ from transformers import CLIPProcessor, CLIPModel
8
+ import torch
9
+ import requests
10
+ import base64
11
+ from selenium import webdriver
12
+ from selenium.webdriver.common.by import By
13
+ from selenium.webdriver.common.keys import Keys
14
+ from selenium.webdriver.chrome.service import Service
15
+ from selenium.webdriver.chrome.options import Options
16
+ from webdriver_manager.chrome import ChromeDriverManager
17
+ from bs4 import BeautifulSoup
18
+ import urllib.parse
19
+ import time
20
+ from deepface import DeepFace
21
+ from pymilvus import Collection, connections, CollectionSchema, FieldSchema, DataType
22
+ import numpy as np
23
+ # import faiss
24
+ import os
25
+ import pickle
26
+ import pprint
27
+ import cv2
28
+ from dotenv import load_dotenv
29
+ load_dotenv()
30
+
31
+
32
+ milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
33
+ milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
34
+
35
+ connections.connect("default", host=milvus_host, port=int(milvus_port))
36
+
37
+
38
+
39
+
40
+ blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
41
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
42
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
43
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
44
+
45
+
46
+ ES_HOST = "https://localhost:9200"
47
+ ES_USER = "elastic"
48
+ ES_PASS = "qR_BblnAzT-1pOQgFRvZ"
49
+ ES_INDEX = "faces"
50
+
51
+ class DevLabImage:
52
+
53
+ def __init__(self, image_path = None):
54
+ self.image_path = image_path
55
+
56
+ def sanitize_name(self, title, replace ='_'):
57
+ import re
58
+ title = re.sub(r'\s+', ' ', title).strip()
59
+ return re.sub(r'[\\/*?:"<>|]', replace, title)
60
+
61
+
62
+ def extract_text(self, image_path):
63
+ import easyocr
64
+ reader = easyocr.Reader(["en", "ms"]) # English & Malay
65
+ text = reader.readtext(image_path, detail=0)
66
+ return " ".join(text)
67
+
68
+ def extract_text_numpy(self, np_array):
69
+ import easyocr
70
+ reader = easyocr.Reader(["en", "ms"]) # English & Malay
71
+ text = reader.readtext(np_array, detail=0)
72
+ return text
73
+
74
+ # def get_emotions(self):
75
+ # from deepface import DeepFace
76
+ # return DeepFace.analyze(self.image_path, actions=['emotion'])
77
+
78
+ def extract_exif(self, image_path):
79
+ """Extract EXIF metadata from an image"""
80
+
81
+ image = Image.open(image_path)
82
+ exif_data = image._getexif()
83
+
84
+ metadata = {}
85
+ if exif_data:
86
+ for tag, value in exif_data.items():
87
+ tag_name = TAGS.get(tag, tag)
88
+ metadata[tag_name] = value
89
+
90
+ return metadata
91
+
92
+ def extract_metadata_exiftool(self,image_path):
93
+ """Extract IPTC, XMP, and EXIF metadata using ExifTool"""
94
+
95
+ command = ["exiftool", "-j", image_path]
96
+ result = subprocess.run(command, capture_output=True, text=True)
97
+ metadata = json.loads(result.stdout)[0] if result.stdout else {}
98
+
99
+ return metadata
100
+
101
+
102
+ def generate_description_blip(self, image_path):
103
+ """Generate an image description using BLIP"""
104
+
105
+ image = Image.open(image_path).convert("RGB")
106
+ inputs = blip_processor(image, return_tensors="pt")
107
+ out = blip_model.generate(**inputs)
108
+ return blip_processor.decode(out[0], skip_special_tokens=True)
109
+
110
+ def extract_image_features(self,image_path):
111
+ """Extract image embeddings using CLIP"""
112
+
113
+
114
+ image = Image.open(image_path)
115
+ inputs = clip_processor(images=image, return_tensors="pt")
116
+ with torch.no_grad():
117
+ features = clip_model.get_image_features(**inputs)
118
+ return features.squeeze().numpy()
119
+
120
+ # def download_google(self,arguments):
121
+ # """Download from Google"""
122
+ # response = google_images_download.googleimagesdownload()
123
+ # response.download(arguments)
124
+
125
+
126
+ # def download_person(self,person_name):
127
+ # # Define the emotions to search
128
+ # emotions = ["happy", "sad", "angry", "surprised"]
129
+
130
+ # for emotion in emotions:
131
+ # arguments = {
132
+ # "keywords": f"{person_name} {emotion} face",
133
+ # "limit": 10, # Download 10 images per emotion
134
+ # "print_urls": True,
135
+ # "format": "jpg",
136
+ # "output_directory": "people",
137
+ # "image_directory": self.sanitize_name(person_name, ' ') # Save into separate folders per emotion
138
+ # }
139
+ # self.download_google(arguments)
140
+
141
+ def download_image(self, url, folder, image_name):
142
+ """Download and save the image."""
143
+
144
+ try:
145
+ if url.startswith("data:image/"): # Base64 encoded image
146
+ header, encoded_data = url.split(",", 1)
147
+ extension = header.split(";")[0].split("/")[-1] # Extract file type (jpg, png, etc.)
148
+ image_path = os.path.join(folder, f"{image_name}.{extension}")
149
+
150
+ os.makedirs(folder, exist_ok=True)
151
+ with open(image_path, "wb") as file:
152
+ file.write(base64.b64decode(encoded_data))
153
+
154
+ print(f"✅ Base64 image saved: {image_path}")
155
+
156
+ else: # URL download
157
+
158
+ response = requests.get(url, stream=True, timeout=10)
159
+ if response.status_code == 200:
160
+ os.makedirs(folder, exist_ok=True)
161
+ image_path = os.path.join(folder, f"{image_name}.jpg")
162
+ with open(image_path, "wb") as file:
163
+ for chunk in response.iter_content(1024):
164
+ file.write(chunk)
165
+ print(f"✅ Downloaded: {image_path}")
166
+ else:
167
+ print(f"❌ Failed to download: {url}")
168
+ except Exception as e:
169
+ print(f"⚠ Error downloading {url}: {e}")
170
+
171
+ def has_min_img_size(self, tag, min_size=100):
172
+ img = tag.find("img")
173
+ if img and img.has_attr("width") and img.has_attr("height"):
174
+ try:
175
+ width = int(img["width"])
176
+ height = int(img["height"])
177
+ return width >= min_size and height >= min_size
178
+ except ValueError:
179
+ return False
180
+ return False
181
+
182
+ def search_google_images(self, query, num_images=10):
183
+
184
+ # Set up Chrome WebDriver
185
+ options = Options()
186
+ options.binary_location = "/usr/bin/chromium" # important for Docker
187
+ options.add_argument("--headless") # Run in background
188
+ options.add_argument("--no-sandbox")
189
+ options.add_argument("--disable-dev-shm-usage")
190
+ options.add_argument("--disable-gpu")
191
+ options.add_argument("--window-size=1920x1080")
192
+
193
+ # Create driver using installed chromedriver
194
+ driver = webdriver.Chrome(
195
+ service=Service("/usr/bin/chromedriver"), # use system-installed path
196
+ options=options
197
+ )
198
+
199
+
200
+ """Search Google Images and extract image URLs."""
201
+ encoded_query = urllib.parse.quote(query)
202
+ search_url = f"https://www.google.com/search?q={encoded_query}&tbm=isch&sclient=img"
203
+
204
+ print(f"🔍 Searching for: {query}")
205
+
206
+ driver.get(search_url)
207
+ time.sleep(2) # Wait for page to load
208
+
209
+ list_items = driver.find_elements(By.CSS_SELECTOR, "div[role='listitem']")
210
+ list_items[1].click()
211
+ time.sleep(3) # Wait for page to load
212
+
213
+ # Scroll to load more images
214
+ for _ in range(3):
215
+ driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
216
+ time.sleep(2)
217
+
218
+ # Extract image URLs
219
+ soup = BeautifulSoup(driver.page_source, "html.parser")
220
+
221
+ # target_div = soup.find("div", {"id":query})
222
+
223
+ # # Extract all <img> tags inside the div
224
+ # if target_div:
225
+ # images = target_div.find_all("img")
226
+ # # images = soup.find_all("img")
227
+ # else:
228
+ # images = soup.select("g-img img")
229
+ # g_imgs = [g for g in soup.find_all("g-img") if g.get("style") not in ("width:12px;height:12px", "width:46px;height:46px")]
230
+ g_imgs = [g for g in soup.find_all("g-img") if self.has_min_img_size(g)]
231
+
232
+
233
+ # g_imgs = soup.select("g-img")
234
+
235
+ # print(g_imgs)
236
+ # driver.quit()
237
+ # return
238
+
239
+ image_urls = []
240
+ for gimg in g_imgs:
241
+ if len(image_urls) >= num_images:
242
+ break
243
+ img = gimg.find('img')
244
+ src = img.get("src") if img else None
245
+ # Skip entries without a usable image source
+ if not src:
+ continue
246
+ if src.startswith("data:image/"):
247
+ mime_type = src.split(";")[0].split(":")[1] # Extract MIME type
248
+ file_extension = mime_type.split("/")[-1] # Extract file extension
249
+ else:
250
+ file_extension = src.split(".")[-1].split("?")[0].lower() # Extract file extension from URL
251
+
252
+ # Skip GIFs
253
+ if file_extension == "gif":
254
+ continue
255
+ # if not src or not src.startswith("data:image/"):
256
+ # continue
257
+
258
+ # mime_type = src.split(";")[0].split(":")[1]
259
+ # file_extension = mime_type.split("/")[-1]
260
+ # if file_extension == "gif":
261
+ # continue
262
+
263
+ image_urls.append(src)
264
+
265
+ print(f"✅ Found {len(image_urls)} images for {query}")
266
+ driver.quit()
267
+ return image_urls
268
+
269
+ def download_person_images(self, person_name, tags = None):
270
+ """Download images for a person with different emotions."""
271
+ emotions = ["happy", "sad", "angry", "surprised"]
272
+ foldername = self.sanitize_name(person_name, ' ')
273
+ # filename = self.sanitize_name(person_name)
274
+ # for emotion in emotions:
275
+ # folder = f"people/{foldername}"
276
+ # image_urls = self.search_google_images(person_name, emotion)
277
+
278
+ # for i, url in enumerate(image_urls):
279
+ # self.download_image(url, folder, f"{emotion}{i+1}")
280
+
281
+ folder = f"people/{foldername}"
282
+ # query = f"{person_name} headshot OR close-up HD -group -friends -couple -family -crowd -far -selfie {tags}"
283
+ # query = f"'{person_name}' headshot OR close-up HD medium size {tags}"
284
+ # query = f"'{person_name}' official portrait large size"
285
+ query = f"'{person_name}' portrait {tags}"
286
+
287
+ image_urls = self.search_google_images(query, 5)
288
+ for i, url in enumerate(image_urls):
289
+ self.download_image(url, folder, f"{i+1}")
290
+
291
+ return foldername
292
+
293
+ def extract_face(self, person, tags):
294
+
295
+ try:
296
+ collection = Collection("faces")
297
+ collection.load() # Try loading the collection to check if it exists
298
+ print("Collection 'faces' already exists.")
299
+ except Exception as e:
300
+ # If collection doesn't exist, create it
301
+ print(f"Creating collection: {e}")
302
+ fields = [
303
+ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
304
+ FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
305
+ FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
306
+ FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
307
+ FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
308
+ ]
309
+ schema = CollectionSchema(fields, description="Face embeddings")
310
+ collection = Collection(name="faces", schema=schema)
311
+ collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
312
+ collection.load()
313
+
314
+ dataset_path = "people/"
315
+ person_path = os.path.join(dataset_path, person)
316
+ print(person_path)
317
+
318
+ if not os.path.isdir(person_path):
319
+ return
320
+
321
+ image_files = [f for f in os.listdir(person_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
322
+
323
+ for img in image_files:
324
+ img_path = os.path.join(person_path, img)
325
+ try:
326
+ embedding = self.extract_embedding(image_path=img_path)
327
+ if embedding is not None:
328
+ emb = np.array(embedding, dtype=np.float32)
329
+ if emb.size > 0:
330
+ collection.insert([[emb], [person], [tags], ['']])
331
+ print(f"{person} registered")
332
+ else:
333
+ print(f"No embedding found for {img_path}")
334
+
335
+ except Exception as e:
336
+ print(f"Could not process {img_path}: {str(e)}")
337
+
338
+ def register_person(self, person_name, tags = ''):
339
+ """Register a person with their images."""
340
+ folder = self.download_person_images(person_name, tags)
341
+ self.extract_face(folder,tags)
342
+
343
+ def query_embedding(self,query_embedding, top_k=5):
344
+
345
+ # Load the collection
346
+ try:
347
+ collection = Collection("faces")
348
+ collection.load() # Try loading the collection to check if it exists
349
+ print("Collection 'faces' already exists.")
350
+ except Exception as e:
351
+ # If collection doesn't exist, create it
352
+ print(f"Creating collection: {e}")
353
+ fields = [
354
+ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
355
+ FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
356
+ FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
357
+ FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
358
+ FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
359
+ ]
360
+ schema = CollectionSchema(fields, description="Face embeddings")
361
+ collection = Collection(name="faces", schema=schema)
362
+ collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
363
+ collection.load()
364
+
365
+ # query_embedding = self.extract_embedding(query_image_path)
366
+ # if query_embedding is None:
367
+ # print("No embedding extracted for the query image.")
368
+ # return None
369
+
370
+ # Convert the query embedding to a numpy array
371
+ query_emb = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
372
+ params = {"metric_type": "COSINE", "params": {"efTopK": top_k}}
373
+
374
+ search_results = collection.search(query_emb, "embedding", output_fields=["id", "name","short_description","description"], param=params, limit=top_k)
375
+
376
+ return search_results
377
+
378
+
379
+ def extract_embedding(self, image_path):
380
+ try:
381
+ faces = DeepFace.represent(image_path, model_name="Facenet", enforce_detection=False)
382
+
383
+ if faces:
384
+ return faces[0]["embedding"]
385
+ else:
386
+ return None
387
+
388
+ except Exception as e:
389
+ print(f"Failed on {image_path}: {e}")
390
+ return None
391
+
392
+ def detect_faces(self):
393
+
394
+ image = cv2.imread(self.image_path)
395
+
396
+ face_embeddings = DeepFace.represent(self.image_path, model_name="Facenet", enforce_detection=False)
397
+
398
+ if not face_embeddings: # No faces detected
399
+ return "❌ No faces detected in the image."
400
+
401
+ recognized_faces = {}
402
+
403
+ for face_data in face_embeddings:
404
+ # print(face_data)
405
+ face_embedding = np.array(face_data["embedding"]).tolist()
406
+
407
+ face_location = face_data["facial_area"]
408
+ # face_location = face_data["region"]
409
+
410
+ x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
411
+ clipped_face = image[y:y+h, x:x+w]
412
+
413
+ # The search query using cosine similarity
414
+ query = {
415
+ "size": 1,
416
+ "query": {
417
+ "script_score": {
418
+ "query": {"match_all": {}}, # Match all documents
419
+ "script": {
420
+ "source": "(cosineSimilarity(params.query_vector, 'embedding') + 1) / 2", # Cosine similarity formula
421
+ "params": {
422
+ "query_vector": face_embedding # The face embedding you want to compare
423
+ }
424
+ }
425
+ }
426
+ }
427
+ }
428
+
429
+ # Perform the POST request to Elasticsearch
430
+ response = requests.post(
431
+ f"{ES_HOST}/{ES_INDEX}/_search",
432
+ headers={"Content-Type": "application/json"},
433
+ auth=(ES_USER, ES_PASS),
434
+ json=query,
435
+ verify=False # Disable SSL verification for testing (in production, use SSL)
436
+ )
437
+
438
+ # Check if the request was successful
439
+ if response.status_code == 200:
440
+ # return response.json()
441
+ results = response.json()
442
+ # pprint.pprint(results)
443
+ if results['hits']['hits']:
444
+ name = results['hits']['hits'][0]['_source']['name']
445
+ recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {"name": name, "image": clipped_face, "score": results['hits']['hits'][0]['_score']}
446
+
447
+
448
+ return recognized_faces
449
+
450
+ def delete_person(self, person):
451
+ import requests
452
+ import json
453
+
454
+ delete_query = {
455
+ "query": {
456
+ "term": {
457
+ "name": person # Field to match and its value
458
+ }
459
+ }
460
+ }
461
+
462
+ # Send the DELETE request to Elasticsearch
463
+ response = requests.post(
464
+ f"{ES_HOST}/{ES_INDEX}/_delete_by_query",
465
+ auth=(ES_USER, ES_PASS),
466
+ headers={"Content-Type": "application/json"},
467
+ data=json.dumps(delete_query),
468
+ verify=False # Disable SSL verification for testing (use True in production)
469
+ )
470
+
471
+ # Check if the request was successful
472
+ if response.status_code == 200:
473
+ print(f"Documents with name = {person} deleted successfully.")
474
+
475
+
476
+ def analyze(self):
477
+ analysis = DeepFace.analyze(self.image_path, actions= ['age', 'gender', 'race', 'emotion'])
478
+ return analysis[0]
479
+
480
+ def reverse_search(self, image_path):
481
+ from reverse_image_search import reverse_image_search
482
+
483
+ return reverse_image_search(image_path, engines=["google", "yandex"])
484
+
485
+
486
+
487
+
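A usage sketch for DevLabImage, assuming Milvus, Elasticsearch and the system Chromium/chromedriver are reachable as configured above; the import path, person name and image path are illustrative:

    from ai_api.library.devlab_image import DevLabImage

    dl = DevLabImage("media/uploads/sample.jpg")
    dl.register_person("Example Person", tags="politician")  # scrape Google Images, embed faces, store in Milvus
    matches = dl.detect_faces()       # match faces in sample.jpg against the Elasticsearch "faces" index
    caption = dl.generate_description_blip("media/uploads/sample.jpg")  # BLIP caption
    profile = dl.analyze()            # DeepFace age / gender / race / emotion breakdown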
ai_api/library/lowyat_crawler.py ADDED
@@ -0,0 +1,714 @@
1
+ # lowyat_crawler.py
2
+ # Crawler for Lowyat Forum data
3
+
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import pandas as pd
7
+ import time
8
+ import random
9
+ import os
10
+ import json
11
+ import hashlib
12
+ from datetime import datetime, timedelta
13
+ import re
14
+
15
+ # Create cache directory
16
+ CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
17
+ os.makedirs(CACHE_DIR, exist_ok=True)
18
+
19
+ # Lowyat Forum base URL
20
+ LOWYAT_BASE_URL = "https://forum.lowyat.net"
21
+
22
+ # Forum section IDs
23
+ FORUM_SECTIONS = {
24
+ # Main Discussion Forums
25
+ "Kopitiam": "16", # General discussion
26
+ "SeriousKopitiam": "506", # Serious discussions
27
+ "News": "17", # News discussions
28
+ "Politics": "507", # Political discussions
29
+ "Malaysia": "508", # Malaysia-specific topics
30
+ "Lowyat.NET": "18", # Lowyat.NET related discussions
31
+
32
+ # Technology Forums
33
+ "Technology": "19", # Technology discussions
34
+ "Computers": "20", # Computer discussions
35
+ "Notebooks": "32", # Laptop discussions
36
+ "Smartphones": "22", # Smartphone discussions
37
+ "Photography": "29", # Photography discussions
38
+ "GamingPC": "503", # PC Gaming
39
+ "GamingConsole": "504", # Console Gaming
40
+
41
+ # Lifestyle Forums
42
+ "Automotive": "23", # Car and motorcycle discussions
43
+ "Finance": "24", # Financial discussions
44
+ "Property": "25", # Property discussions
45
+ "Travel": "26", # Travel discussions
46
+ "Food": "27", # Food discussions
47
+ "Health": "28", # Health discussions
48
+ "Sports": "30", # Sports discussions
49
+ "Entertainment": "31", # Entertainment discussions
50
+
51
+ # Marketplace Forums
52
+ "SpecialInterestGarageSales": "21", # Buy and sell
53
+ "JobsCorner": "33", # Job listings
54
+ "DigitalMarketplace": "34" # Digital marketplace
55
+ }
56
+
57
+ def get_forum_section_url(section_name):
58
+ """Get the URL for a forum section"""
59
+ if section_name in FORUM_SECTIONS:
60
+ section_id = FORUM_SECTIONS[section_name]
61
+ return f"{LOWYAT_BASE_URL}/forums/{section_id}"
62
+ else:
63
+ # Assume it's a custom section name, try to search for it
64
+ return f"{LOWYAT_BASE_URL}/search/forums?q={section_name}"
65
+
66
+ def clean_text(text):
67
+ """Clean text by removing extra whitespace"""
68
+ if not text:
69
+ return ""
70
+ return re.sub(r'\s+', ' ', text).strip()
71
+
72
+ def extract_date(date_str):
73
+ """Extract and standardize date from Lowyat Forum date string"""
74
+ try:
75
+ # Handle various date formats
76
+ if "Today" in date_str or "Yesterday" in date_str:
77
+ # For relative dates, convert to actual date
78
+ today = datetime.now().date()
79
+ if "Yesterday" in date_str:
80
+ date = today - timedelta(days=1)
81
+ else:
82
+ date = today
83
+
84
+ # Extract time if available
85
+ time_match = re.search(r'(\d+:\d+\s*[AP]M)', date_str)
86
+ if time_match:
87
+ time_str = time_match.group(1)
88
+ return f"{date.isoformat()} {time_str}"
89
+ return date.isoformat()
90
+ else:
91
+ # Try to parse standard date formats
92
+ date_patterns = [
93
+ r'(\d{1,2}-\d{1,2}-\d{4})', # DD-MM-YYYY
94
+ r'(\d{1,2}/\d{1,2}/\d{4})', # DD/MM/YYYY
95
+ r'(\w+ \d{1,2}, \d{4})' # Month DD, YYYY
96
+ ]
97
+
98
+ for pattern in date_patterns:
99
+ match = re.search(pattern, date_str)
100
+ if match:
101
+ return match.group(1)
102
+
103
+ # If no pattern matches, return the original string
104
+ return date_str
105
+ except Exception as e:
106
+ print(f"Error parsing date '{date_str}': {str(e)}")
107
+ return date_str
108
+
109
+ def search_lowyat_forum(keywords, sections=None, max_pages=3, max_threads=20, use_cache=True, cache_ttl_hours=24, verbose=True, use_mock_data=True):
110
+ """
111
+ Search Lowyat Forum for threads matching keywords
112
+
113
+ Args:
114
+ keywords (list): List of keywords to search for
115
+ sections (list): List of forum sections to search in (default: ["Kopitiam", "SeriousKopitiam", "Finance"])
116
+ max_pages (int): Maximum number of search result pages to process
117
+ max_threads (int): Maximum number of threads to process
118
+ use_cache (bool): Whether to use cached results
119
+ cache_ttl_hours (int): How long to keep cached results valid
120
+ verbose (bool): Whether to print verbose output
121
+ use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
122
+
123
+ Returns:
124
+ list: List of thread data dictionaries
125
+ """
126
+ if sections is None:
127
+ sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
128
+
129
+ # Generate cache key
130
+ cache_key = f"lowyat_{'_'.join(keywords)}_{'_'.join(sections)}_{max_pages}_{max_threads}"
131
+ cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
132
+ cache_file = os.path.join(CACHE_DIR, f"lowyat_{cache_hash}.json")
133
+
134
+ # Check cache
135
+ if use_cache and os.path.exists(cache_file):
136
+ try:
137
+ with open(cache_file, 'r') as f:
138
+ cache_data = json.load(f)
139
+
140
+ # Check if cache is still valid
141
+ cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
142
+ cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)
143
+
144
+ if datetime.now() < cache_expiry:
145
+ print(f"[💾] Using cached Lowyat Forum results (expires {cache_expiry.isoformat()})")
146
+ return cache_data.get('threads', [])
147
+ else:
148
+ print(f"[⏰] Cache expired for Lowyat Forum search, fetching fresh data")
149
+ except Exception as e:
150
+ print(f"[⚠️] Error reading Lowyat Forum cache: {str(e)}")
151
+
152
+ all_threads = []
153
+ threads_processed = 0
154
+ cloudflare_detected = False
155
+
156
+ # Process each section
157
+ for section in sections:
158
+ if threads_processed >= max_threads:
159
+ break
160
+
161
+ print(f"[🔍] Searching Lowyat Forum section: {section}")
162
+ section_url = get_forum_section_url(section)
163
+
164
+ # For each keyword, search the section
165
+ for keyword in keywords:
166
+ if threads_processed >= max_threads:
167
+ break
168
+
169
+ print(f"[🔍] Searching for keyword: {keyword}")
170
+
171
+ # Construct search URL
172
+ if "search" in section_url:
173
+ # Already a search URL, add the keyword
174
+ search_url = f"{section_url}+{keyword.replace(' ', '+')}"
175
+ else:
176
+ # Regular section URL, add search parameter
177
+ search_url = f"{section_url}/search?q={keyword.replace(' ', '+')}"
178
+
179
+ # Process search result pages
180
+ for page in range(1, max_pages + 1):
181
+ if threads_processed >= max_threads:
182
+ break
183
+
184
+ page_url = f"{search_url}&page={page}" if page > 1 else search_url
185
+ print(f"[🔍] Processing page {page}: {page_url}")
186
+
187
+ try:
188
+ # Add random delay to avoid rate limiting
189
+ time.sleep(random.uniform(1, 3))
190
+
191
+ # Get search results page with enhanced headers
192
+ headers = {
193
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
194
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
195
+ 'Accept-Language': 'en-US,en;q=0.5',
196
+ 'Accept-Encoding': 'gzip, deflate, br',
197
+ 'Connection': 'keep-alive',
198
+ 'Upgrade-Insecure-Requests': '1',
199
+ 'Cache-Control': 'max-age=0'
200
+ }
201
+
202
+ response = requests.get(page_url, headers=headers, timeout=10)
203
+
204
+ if response.status_code != 200:
205
+ print(f"[❌] Failed to get search results page: {response.status_code}")
206
+ break
207
+
208
+ if verbose:
209
+ print(f"[🔍] Response received: {len(response.text)} bytes")
210
+
211
+ # Check for Cloudflare protection
212
+ if "Cloudflare" in response.text and "challenge" in response.text:
213
+ print(f"[⚠️] Cloudflare protection detected. Cannot access forum content directly.")
214
+ cloudflare_detected = True
215
+ break
216
+
217
+ # Parse search results
218
+ soup = BeautifulSoup(response.text, 'html.parser')
219
+ thread_elements = soup.select('.structItem--thread')
220
+
221
+ if not thread_elements:
222
+ print(f"[⚠️] No threads found on page {page} for keyword '{keyword}' in section '{section}'")
223
+
224
+ if verbose:
225
+ # Print a snippet of the response to help debug
226
+ print(f"[🔍] Response snippet: {response.text[:500]}...")
227
+
228
+ # Check if we're getting a search results page at all
229
+ search_title = soup.select_one('title')
230
+ if search_title:
231
+ print(f"[🔍] Page title: {search_title.get_text()}")
232
+
233
+ # Check if there's a message about no results
234
+ no_results = soup.select_one('.block-row--message')
235
+ if no_results:
236
+ print(f"[🔍] Message: {no_results.get_text()}")
237
+ break
238
+
239
+ # Process each thread
240
+ for thread_elem in thread_elements:
241
+ if threads_processed >= max_threads:
242
+ break
243
+
244
+ try:
245
+ # Extract thread data
246
+ title_elem = thread_elem.select_one('.structItem-title')
247
+ if not title_elem:
248
+ continue
249
+
250
+ title = clean_text(title_elem.get_text())
251
+ thread_url = LOWYAT_BASE_URL + title_elem.find('a')['href']
252
+
253
+ # Extract author
254
+ author_elem = thread_elem.select_one('.structItem-minor')
255
+ author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
256
+
257
+ # Extract date
258
+ date_elem = thread_elem.select_one('.structItem-startDate time')
259
+ date_str = date_elem.get('datetime') if date_elem else "Unknown"
260
+ date = extract_date(date_str)
261
+
262
+ # Extract preview text if available
263
+ preview_elem = thread_elem.select_one('.structItem-excerpt')
264
+ preview = clean_text(preview_elem.get_text()) if preview_elem else ""
265
+
266
+ # Get thread content
267
+ thread_data = get_thread_content(thread_url)
268
+
269
+ # Combine data
270
+ thread_info = {
271
+ "platform": "lowyat_forum",
272
+ "section": section,
273
+ "title": title,
274
+ "author": author,
275
+ "date": date,
276
+ "url": thread_url,
277
+ "preview": preview,
278
+ "content": thread_data.get("content", ""),
279
+ "replies": thread_data.get("replies", [])
280
+ }
281
+
282
+ all_threads.append(thread_info)
283
+ threads_processed += 1
284
+ print(f"[✓] Processed thread: {title} ({threads_processed}/{max_threads})")
285
+
286
+ except Exception as e:
287
+ print(f"[❌] Error processing thread: {str(e)}")
288
+
289
+ # Check if there are more pages
290
+ next_page = soup.select_one('.pageNav-jump--next')
291
+ if not next_page:
292
+ print(f"[⚠️] No more pages for keyword '{keyword}' in section '{section}'")
293
+ break
294
+
295
+ except Exception as e:
296
+ print(f"[❌] Error processing page {page}: {str(e)}")
297
+ break
298
+
299
+ # If no threads found and Cloudflare detected, use mock data if enabled
300
+ if not all_threads and cloudflare_detected and use_mock_data:
301
+ print(f"[ℹ️] Using mock data for Lowyat Forum due to Cloudflare protection")
302
+ all_threads = generate_mock_lowyat_data(keywords, sections, max_threads)
303
+
304
+ # Save results to cache
305
+ if use_cache:
306
+ try:
307
+ cache_data = {
308
+ "threads": all_threads,
309
+ "timestamp": datetime.now().isoformat(),
310
+ "keywords": keywords,
311
+ "sections": sections
312
+ }
313
+
314
+ with open(cache_file, 'w') as f:
315
+ json.dump(cache_data, f)
316
+
317
+ print(f"[💾] Saved Lowyat Forum results to cache: {cache_file}")
318
+ except Exception as e:
319
+ print(f"[⚠️] Error saving Lowyat Forum results to cache: {str(e)}")
320
+
321
+ return all_threads
322
+
323
+
324
+ def generate_mock_lowyat_data(keywords, sections, max_threads):
325
+ """
326
+ Generate mock data for Lowyat Forum when real data cannot be retrieved
327
+
328
+ Args:
329
+ keywords (list): List of keywords used for the search
330
+ sections (list): List of forum sections that were searched
331
+ max_threads (int): Maximum number of threads to generate
332
+
333
+ Returns:
334
+ list: List of mock thread data dictionaries
335
+ """
336
+ print(f"[💻] Generating mock data for keywords: {', '.join(keywords)}")
337
+
338
+ # Create a list to store mock threads
339
+ mock_threads = []
340
+
341
+ # Define some common Malaysian usernames
342
+ usernames = [
343
+ "MalaysianGuy", "KLite", "JohorianPride", "PenangFoodie", "SarawakExplorer",
344
+ "MalaccaHistory", "SabahAdventure", "IPohBoy", "KuchingCat", "TerengganuDiver",
345
+ "PerakMan", "KedahPadi", "NegeriS9", "PahangForest", "MelakaCendol"
346
+ ]
347
+
348
+ # Define some common topics based on keywords
349
+ topics_by_keyword = {
350
+ "cukai": [
351
+ "Cukai baharu akan diperkenalkan tahun depan?",
352
+ "Pendapat tentang cukai keuntungan modal",
353
+ "Cara menjimatkan cukai pendapatan",
354
+ "Cukai jualan dan perkhidmatan (SST) vs GST",
355
+ "Adakah cukai kereta import akan dikurangkan?"
356
+ ],
357
+ "minyak sawit": [
358
+ "Harga minyak sawit dijangka naik bulan depan",
359
+ "EU ban minyak sawit: Kesan kepada Malaysia",
360
+ "Industri minyak sawit dan isu kelestarian",
361
+ "Minyak sawit vs minyak zaitun: Mana lebih sihat?",
362
+ "Eksport minyak sawit Malaysia meningkat 15%"
363
+ ],
364
+ "kerajaan": [
365
+ "Kerajaan akan umum inisiatif baharu untuk sektor perumahan",
366
+ "Polisi kerajaan untuk industri teknologi",
367
+ "Kerajaan perkenal subsidi baharu untuk petani",
368
+ "Pandangan tentang prestasi kerajaan semasa",
369
+ "Kerajaan lancar program bantuan PKS"
370
+ ],
371
+ "ekonomi": [
372
+ "Ekonomi Malaysia dijangka pulih pada Q3",
373
+ "Kesan inflasi kepada ekonomi tempatan",
374
+ "Ringgit vs USD: Analisis semasa",
375
+ "Sektor pelancongan menyumbang kepada pemulihan ekonomi",
376
+ "Bagaimana keadaan ekonomi mempengaruhi pasaran hartanah?"
377
+ ]
378
+ }
379
+
380
+ # Default topics if no matching keywords
381
+ default_topics = [
382
+ "Pandangan tentang isu semasa di Malaysia",
383
+ "Perbincangan tentang kenaikan harga barang",
384
+ "Cadangan tempat makan sedap di KL",
385
+ "Perkongsian pengalaman kerja dari rumah",
386
+ "Tips melabur dalam pasaran saham Malaysia"
387
+ ]
388
+
389
+ # Generate threads for each section
390
+ threads_per_section = max(1, max_threads // len(sections))
391
+
392
+ for section in sections:
393
+ # Find relevant topics based on keywords
394
+ relevant_topics = []
395
+ for keyword in keywords:
396
+ keyword_lower = keyword.lower()
397
+ # Check if we have predefined topics for this keyword
398
+ for k, topics in topics_by_keyword.items():
399
+ if k in keyword_lower or keyword_lower in k:
400
+ relevant_topics.extend(topics)
401
+
402
+ # If no relevant topics found, use default topics
403
+ if not relevant_topics:
404
+ relevant_topics = default_topics
405
+
406
+ # Generate threads for this section
407
+ for i in range(threads_per_section):
408
+ if len(mock_threads) >= max_threads:
409
+ break
410
+
411
+ # Select a topic
412
+ topic = random.choice(relevant_topics)
413
+
414
+ # Generate a date within the last month
415
+ days_ago = random.randint(1, 30)
416
+ thread_date = (datetime.now() - timedelta(days=days_ago)).isoformat()
417
+
418
+ # Generate content
419
+ content = f"Ini adalah perbincangan tentang {topic}. "
420
+ content += f"Saya ingin berkongsi pendapat dan mendapatkan maklum balas daripada ahli forum. "
421
+ content += f"Apakah pandangan anda tentang perkara ini?"
422
+
423
+ # Generate replies
424
+ num_replies = random.randint(1, 5)
425
+ replies = []
426
+
427
+ for j in range(num_replies):
428
+ reply_days_ago = random.randint(0, days_ago)
429
+ reply_date = (datetime.now() - timedelta(days=reply_days_ago)).isoformat()
430
+
431
+ reply_username = random.choice(usernames)
432
+ reply_content = f"Saya bersetuju dengan pendapat anda tentang {topic}. "
433
+ reply_content += f"Ini adalah pandangan saya..."
434
+
435
+ replies.append({
436
+ "author": reply_username,
437
+ "date": reply_date,
438
+ "content": reply_content
439
+ })
440
+
441
+ # Create thread info
442
+ thread_info = {
443
+ "platform": "lowyat_forum",
444
+ "section": section,
445
+ "title": topic,
446
+ "author": random.choice(usernames),
447
+ "date": thread_date,
448
+ "url": f"https://forum.lowyat.net/topic/{random.randint(100000, 999999)}",
449
+ "preview": content[:100] + "...",
450
+ "content": content,
451
+ "replies": replies
452
+ }
453
+
454
+ mock_threads.append(thread_info)
455
+ print(f"[💻] Generated mock thread: {topic} in {section}")
456
+
457
+ return mock_threads
458
+
459
+ def get_thread_content(thread_url, max_posts=10):
460
+ """
461
+ Get content from a Lowyat Forum thread
462
+
463
+ Args:
464
+ thread_url (str): URL of the thread
465
+ max_posts (int): Maximum number of posts to extract
466
+
467
+ Returns:
468
+ dict: Thread content and replies
469
+ """
470
+ try:
471
+ # Add random delay to avoid rate limiting
472
+ time.sleep(random.uniform(1, 3))
473
+
474
+ # Get thread page
475
+ response = requests.get(thread_url, headers={
476
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
477
+ })
478
+
479
+ if response.status_code != 200:
480
+ print(f"[❌] Failed to get thread page: {response.status_code}")
481
+ return {"content": "", "replies": []}
482
+
483
+ # Parse thread page
484
+ soup = BeautifulSoup(response.text, 'html.parser')
485
+
486
+ # Get main post content
487
+ main_post = soup.select_one('.message--post')
488
+ content = ""
489
+ if main_post:
490
+ content_elem = main_post.select_one('.message-body .bbWrapper')
491
+ content = clean_text(content_elem.get_text()) if content_elem else ""
492
+
493
+ # Get replies
494
+ replies = []
495
+ reply_elements = soup.select('.message--post')[1:max_posts+1] # Skip the first post (main content)
496
+
497
+ for reply_elem in reply_elements:
498
+ try:
499
+ # Extract reply author
500
+ author_elem = reply_elem.select_one('.message-name')
501
+ author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
502
+
503
+ # Extract reply date
504
+ date_elem = reply_elem.select_one('.message-attribution-main time')
505
+ date_str = date_elem.get('datetime') if date_elem else "Unknown"
506
+ date = extract_date(date_str)
507
+
508
+ # Extract reply content
509
+ content_elem = reply_elem.select_one('.message-body .bbWrapper')
510
+ reply_content = clean_text(content_elem.get_text()) if content_elem else ""
511
+
512
+ replies.append({
513
+ "author": author,
514
+ "date": date,
515
+ "content": reply_content
516
+ })
517
+ except Exception as e:
518
+ print(f"[❌] Error processing reply: {str(e)}")
519
+
520
+ return {
521
+ "content": content,
522
+ "replies": replies
523
+ }
524
+
525
+ except Exception as e:
526
+ print(f"[❌] Error getting thread content: {str(e)}")
527
+ return {"content": "", "replies": []}
528
+
529
+ def convert_to_dataframe(threads):
530
+ """
531
+ Convert Lowyat Forum thread data to a DataFrame compatible with the claim analysis system
532
+
533
+ Args:
534
+ threads (list): List of thread data dictionaries
535
+
536
+ Returns:
537
+ pandas.DataFrame: DataFrame with standardized columns
538
+ """
539
+ records = []
540
+
541
+ for thread in threads:
542
+ # Add the main thread as a record
543
+ main_record = {
544
+ "platform": "LowyatForum", # Changed to standardized label
545
+ "date": thread.get("date", ""),
546
+ "username": thread.get("author", ""),
547
+ "post_text": thread.get("title", "") + " " + thread.get("content", ""),
548
+ "post_url": thread.get("url", ""),
549
+ "likes": 0, # Lowyat doesn't expose like counts in the HTML
550
+ "shares": 0, # No share counts
551
+ "comments_count": len(thread.get("replies", [])),
552
+ "comment_text": "",
553
+ "combined_text": thread.get("title", "") + " " + thread.get("content", "")
554
+ }
555
+ records.append(main_record)
556
+
557
+ # Add each reply as a separate record
558
+ for reply in thread.get("replies", []):
559
+ reply_record = {
560
+ "platform": "LowyatForum_Comment", # Changed to standardized label
561
+ "date": reply.get("date", ""),
562
+ "username": reply.get("author", ""),
563
+ "post_text": "",
564
+ "post_url": thread.get("url", ""),
565
+ "likes": 0,
566
+ "shares": 0,
567
+ "comments_count": 0,
568
+ "comment_text": reply.get("content", ""),
569
+ "combined_text": reply.get("content", "")
570
+ }
571
+ records.append(reply_record)
572
+
573
+ # Create DataFrame
574
+ if records:
575
+ df = pd.DataFrame(records)
576
+ return df
577
+ else:
578
+ # Return empty DataFrame with correct columns
579
+ return pd.DataFrame(columns=[
580
+ "platform", "date", "username", "post_text", "post_url",
581
+ "likes", "shares", "comments_count", "comment_text", "combined_text"
582
+ ])
583
+
584
+ def run(keywords, sections=None, max_threads=20, output_path=None, full_claim=None, verbose=True, use_mock_data=True):
585
+ """
586
+ Run the Lowyat Forum crawler and save results
587
+
588
+ Args:
589
+ keywords (list): List of keywords to search for
590
+ sections (list): List of forum sections to search in
591
+ max_threads (int): Maximum number of threads to process
592
+ output_path (str): Path to save results CSV
593
+ full_claim (str): The full claim text for more targeted searching
594
+ verbose (bool): Whether to print verbose output
595
+ use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
596
+
597
+ Returns:
598
+ pandas.DataFrame: DataFrame with crawled data
599
+ """
600
+ print(f"[🔍] Starting Lowyat Forum crawler for keywords: {', '.join(keywords)}")
601
+
602
+ # Check if this is a crime-related claim about Kelantan
603
+ crime_related = any(kw in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"] for kw in keywords)
604
+ kelantan_related = any("kelantan" in kw.lower() for kw in keywords)
605
+
606
+ # Use the full claim directly if available for crime-related claims in Kelantan
607
+ if full_claim and crime_related and kelantan_related:
608
+ print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
609
+
610
+ # Use the full claim as a single search term
611
+ keywords = [full_claim]
612
+
613
+ # Also add these specialized keywords for better coverage
614
+ specialized_keywords = [
615
+ "polis kelantan",
616
+ "kes rogol kelantan",
617
+ "sumbang mahram",
618
+ "jenayah seksual"
619
+ ]
620
+
621
+ # Add specialized keywords to the search
622
+ keywords.extend(specialized_keywords)
623
+ print(f"[🔍] Using keywords: {', '.join(keywords)}")
624
+ # Use more targeted keywords for crime-related claims in Kelantan (if no full claim)
625
+ elif crime_related and kelantan_related:
626
+ print("[🔍] Detected crime-related claim about Kelantan, using specialized keywords")
627
+ keywords = [
628
+ "polis kelantan",
629
+ "kes rogol kelantan",
630
+ "sumbang mahram",
631
+ "jenayah seksual"
632
+ ]
633
+ # Add context-specific keywords for other types of claims
634
+ elif full_claim:
635
+ # Check for economic/financial claims
636
+ if any(term in full_claim.lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]):
637
+ print("[🔍] Detected economic/financial claim, adding relevant keywords")
638
+ econ_keywords = ["ekonomi malaysia", "kewangan", "cukai", "subsidi", "harga"]
639
+ keywords.extend([k for k in econ_keywords if k not in keywords])
640
+
641
+ # Check for political claims
642
+ elif any(term in full_claim.lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]):
643
+ print("[🔍] Detected political claim, adding relevant keywords")
644
+ pol_keywords = ["kerajaan", "politik malaysia", "dasar", "kabinet"]
645
+ keywords.extend([k for k in pol_keywords if k not in keywords])
646
+
647
+ # Set default sections if not provided
648
+ if sections is None:
649
+ sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
650
+
651
+ # Validate sections against available forum sections
652
+ valid_sections = [section for section in sections if section in FORUM_SECTIONS]
653
+ if not valid_sections:
654
+ print("[⚠️] No valid forum sections provided. Using default sections.")
655
+ valid_sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
656
+
657
+ # If sections were invalid, inform the user
658
+ if len(valid_sections) != len(sections):
659
+ print(f"[⚠️] Some sections were invalid. Using: {', '.join(valid_sections)}")
660
+
661
+ # For crime-related topics, prioritize SeriousKopitiam
662
+ if crime_related and "SeriousKopitiam" in valid_sections:
663
+ # Move SeriousKopitiam to the front of the list
664
+ valid_sections.remove("SeriousKopitiam")
665
+ valid_sections.insert(0, "SeriousKopitiam")
666
+
667
+ # For economic topics, prioritize Finance
668
+ elif any(term in "".join(keywords).lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]) and "Finance" in valid_sections:
669
+ valid_sections.remove("Finance")
670
+ valid_sections.insert(0, "Finance")
671
+
672
+ # For political topics, prioritize Politics
673
+ elif any(term in "".join(keywords).lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]) and "Politics" in valid_sections:
674
+ valid_sections.remove("Politics")
675
+ valid_sections.insert(0, "Politics")
676
+
677
+ # Search forum with enhanced options
678
+ threads = search_lowyat_forum(
679
+ keywords,
680
+ sections=valid_sections,
681
+ max_threads=max_threads,
682
+ verbose=verbose,
683
+ use_mock_data=use_mock_data
684
+ )
685
+ print(f"[✓] Found {len(threads)} threads on Lowyat Forum")
686
+
687
+ # Convert to DataFrame
688
+ df = convert_to_dataframe(threads)
689
+ print(f"[✓] Converted to {len(df)} records")
690
+
691
+ # Save to CSV if output path provided
692
+ if output_path and len(df) > 0:
693
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
694
+ df.to_csv(output_path, index=False)
695
+ print(f"[💾] Saved Lowyat Forum data to {output_path}")
696
+ elif output_path:
697
+ # Create an empty CSV file with the correct columns
698
+ empty_df = pd.DataFrame(columns=[
699
+ "platform", "date", "username", "post_text", "post_url",
700
+ "likes", "shares", "comments_count", "comment_text", "combined_text"
701
+ ])
702
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
703
+ empty_df.to_csv(output_path, index=False)
704
+ print(f"[💾] Saved empty Lowyat Forum data file to {output_path}")
705
+
706
+ return df
707
+
708
+ # Test the crawler if run directly
709
+ if __name__ == "__main__":
710
+ test_keywords = ["cukai minyak sawit", "palm oil tax"]
711
+ test_sections = ["Kopitiam", "Finance"]
712
+
713
+ df = run(test_keywords, sections=test_sections, max_threads=10)
714
+ print(df.head())
ai_api/library/priority_indexer.py ADDED
@@ -0,0 +1,360 @@
1
+ # priority_indexer.py
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+ import re
6
+ from datetime import datetime
7
+
8
+ def load_agency_keywords(filepath=None):
9
+ """
10
+ Load keywords for agency detection or use default keywords if file not found
11
+ """
12
+ # Define default agency keywords if file not provided or not found
13
+ default_keywords = {
14
+ # Government-related keywords
15
+ "government": [
16
+ "kerajaan", "menteri", "perdana menteri", "kementerian", "jabatan",
17
+ "agensi", "dasar", "parlimen", "dewan rakyat", "dewan negara",
18
+ "dun", "pejabat", "keselamatan negara", "atm", "polis",
19
+ "kdn", "hasil", "sop", "ancaman", "pentadbiran", "kabinet",
20
+ "politik", "ahli parlimen", "wakil rakyat", "adun", "pemimpin",
21
+ "ketua menteri", "menteri besar", "exco", "majlis", "pihak berkuasa",
22
+ "pbt", "majlis perbandaran", "majlis bandaraya", "dewan bandaraya"
23
+ ],
24
+
25
+ # Economic keywords
26
+ "economic": [
27
+ "ekonomi", "kewangan", "bank", "cukai", "subsidi", "harga", "kos",
28
+ "perbelanjaan", "pendapatan", "gaji", "dividen", "saham", "pasaran",
29
+ "inflasi", "deflasi", "krisis", "kemelesetan", "pertumbuhan", "gdp",
30
+ "kdnk", "pelaburan", "pelabur", "perniagaan", "syarikat", "industri",
31
+ "sektor", "perdagangan", "import", "eksport", "mata wang", "ringgit",
32
+ "dolar", "hutang", "pinjaman", "faedah", "untung", "rugi", "bayaran",
33
+ "fi", "yuran", "perbelanjaan", "pendapatan", "bonus", "elaun",
34
+ "insentif", "bantuan", "sumbangan", "derma", "zakat", "duti",
35
+ "levi", "caj", "jualan", "belian", "pembelian", "perolehan",
36
+ "tender", "kontrak", "projek", "pembangunan", "infrastruktur",
37
+ "pembinaan", "hartanah", "rumah", "kediaman", "komersial",
38
+ "tanah", "saiz", "keluasan", "murah", "mahal", "berpatutan",
39
+ "mampu", "tidak mampu", "bekalan", "stok", "inventori",
40
+ "simpanan", "rizab", "aset", "liabiliti", "kredit", "debit",
41
+ "ansuran", "keuntungan", "kerugian", "defisit", "surplus",
42
+ "lebihan", "kekurangan", "kenaikan", "penurunan", "peningkatan",
43
+ "pengurangan", "pemulihan", "pembaikan"
44
+ ],
45
+
46
+ # Law-related keywords
47
+ "law": [
48
+ "undang-undang", "perundangan", "akta", "enakmen", "ordinan",
49
+ "peraturan", "perlembagaan", "mahkamah", "hakim", "peguam",
50
+ "pendakwa", "pendakwaan", "pertuduhan", "dakwaan", "saman",
51
+ "waran", "tangkap", "tahan", "reman", "jamin", "ikat jamin",
52
+ "denda", "hukuman", "penjara", "polis", "balai", "laporan",
53
+ "aduan", "siasatan", "siasat", "jenayah", "sivil", "kes",
54
+ "fail", "bicara", "perbicaraan", "prosiding", "rayuan",
55
+ "petisyen", "pindaan", "bon", "jaminan", "saksi", "keterangan",
56
+ "bukti", "forensik", "peguambela", "peguamcara", "pendakwa raya",
57
+ "majistret", "ketua hakim", "ketua hakim negara", "hakim besar",
58
+ "mahkamah tinggi", "mahkamah rayuan", "mahkamah persekutuan",
59
+ "mahkamah rendah", "mahkamah majistret", "mahkamah sesyen",
60
+ "mahkamah syariah", "pdrm", "ibu pejabat polis", "ketua polis",
61
+ "pegawai polis", "anggota polis", "konstabel", "koperal",
62
+ "sarjan", "inspektor", "superintendan", "komisioner", "sprm",
63
+ "suruhanjaya pencegahan rasuah", "rasuah", "korupsi",
64
+ "salah guna kuasa", "penyelewengan", "pecah amanah",
65
+ "pengubahan wang haram"
66
+ ],
67
+
68
+ # Danger-related keywords
69
+ "danger": [
70
+ "bahaya", "merbahaya", "risiko", "ancaman", "bencana", "malapetaka",
71
+ "tragedi", "musibah", "kemalangan", "nahas", "kecelakaan", "kecederaan",
72
+ "kematian", "korban", "mangsa", "kemusnahan", "kerosakan", "kerugian",
73
+ "kehilangan", "kecurian", "rompakan", "samun", "ragut", "pecah",
74
+ "pecah rumah", "pecah masuk", "curi", "culik", "bunuh", "bunuh diri",
75
+ "mati", "cedera", "parah", "kritikal", "koma", "luka", "patah",
76
+ "retak", "lebam", "bengkak", "darah", "pendarahan", "kecemasan",
77
+ "ambulans", "hospital", "klinik", "doktor", "ubat", "dadah",
78
+ "narkotik", "ganja", "heroin", "kokain", "syabu", "pil kuda",
79
+ "ekstasi", "ketamin", "morfin", "ketagihan", "penagih", "pengedar",
80
+ "sindiket", "kartel", "mafia", "gangster", "kongsi gelap", "geng",
81
+ "kumpulan jenayah", "penjenayah", "penjahat", "pesalah", "banduan",
82
+ "tahanan", "suspek", "tertuduh", "terdakwa", "senjata", "pistol",
83
+ "revolver", "senapang", "rifle", "shotgun", "bom", "granat",
84
+ "peluru", "kelongsong", "senjata api", "senjata tajam", "pisau",
85
+ "parang", "kapak", "keris", "pedang", "racun", "toksin", "kimia",
86
+ "biologi", "nuklear", "radiasi", "sinaran", "letupan", "ledakan",
87
+ "kebakaran", "api", "nyalaan", "bara", "asap", "hangus", "terbakar",
88
+ "banjir", "bah", "limpahan", "hujan", "ribut", "taufan", "siklon",
89
+ "hurikan", "tornado", "puting beliung", "angin kencang", "kilat",
90
+ "petir", "guruh", "guntur", "halilintar", "tanah runtuh", "gelinciran tanah",
91
+ "runtuhan", "runtuh", "jatuh", "roboh", "rebah", "tumbang", "gempa",
92
+ "gempa bumi", "tsunami", "ombak besar", "gelombang tinggi", "kemarau",
93
+ "kekeringan", "perang", "pertempuran", "pergaduhan", "perkelahian",
94
+ "rusuhan", "kekacauan", "huru-hara", "keganasan", "kekerasan",
95
+ "keselamatan", "keselamatan negara", "keselamatan awam", "kanser",
96
+ "barah", "tumor", "penyakit", "wabak", "epidemik", "pandemik",
97
+ "jangkitan", "virus", "bakteria", "nyawa", "terancam", "maut"
98
+ ]
99
+ }
100
+
101
+ # Try to load from file if provided
102
+ if filepath and os.path.exists(filepath):
103
+ try:
104
+ df = pd.read_csv(filepath)
105
+ if 'keyword' in df.columns and 'category' in df.columns:
106
+ # Group keywords by category
107
+ keywords = {}
108
+ for category in df['category'].unique():
109
+ keywords[category] = df[df['category'] == category]['keyword'].tolist()
110
+ return keywords
111
+ else:
112
+ print(f"[⚠️] Warning: Required columns not found in {filepath}. Using default keywords.")
113
+ return default_keywords
114
+ except Exception as e:
115
+ print(f"[⚠️] Error loading agency keywords from {filepath}: {e}")
116
+ return default_keywords
117
+ else:
118
+ if filepath:
119
+ print(f"[ℹ️] Agency keywords file not found. Using default keywords.")
120
+ return default_keywords
121
+
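If an agencies CSV is supplied, load_agency_keywords expects a 'keyword' column and a 'category' column; anything else falls back to the defaults above. Example rows (invented):

    # keyword,category
    # kementerian kesihatan,government
    # cukai pendapatan,economic
    # mahkamah tinggi,law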
122
+ def analyze_text_content(df, keywords_dict):
123
+ """
124
+ Analyze text content in the dataframe to find keywords
125
+ Returns a dictionary of found keywords by category
126
+ """
127
+ found_keywords = {category: [] for category in keywords_dict.keys()}
128
+
129
+ # Combine all text columns
130
+ text_columns = ['post_text', 'comment_text', 'title', 'snippet', 'combined_text']
131
+ all_text = ""
132
+
133
+ for col in text_columns:
134
+ if col in df.columns:
135
+ all_text += " " + " ".join(df[col].fillna("").astype(str))
136
+
137
+ all_text = all_text.lower()
138
+
139
+ # Search for keywords in the combined text
140
+ for category, keywords in keywords_dict.items():
141
+ for keyword in keywords:
142
+ if keyword.lower() in all_text:
143
+ found_keywords[category].append(keyword)
144
+
145
+ # Remove duplicates and limit to top 5 per category
146
+ for category in found_keywords:
147
+ found_keywords[category] = list(set(found_keywords[category]))[:5]
148
+
149
+ return found_keywords
150
+
151
+ def calculate_priority_score(flags):
152
+ """Calculate priority score based on flags"""
153
+ # Base weights for different flags
154
+ weights = {
155
+ "fact_check_value": 1.0,
156
+ "cause_confusion": 1.5,
157
+ "cause_chaos": 1.8,
158
+ "affects_government": 1.0,
159
+ "economic_impact": 0.8,
160
+ "law_related": 0.8,
161
+ "public_interest": 1.2,
162
+ "lives_in_danger": 1.5,
163
+ "viral": 1.0,
164
+ "urgent": 2.0
165
+ }
166
+
167
+ # Calculate weighted score
168
+ score = 0
169
+ for flag, value in flags.items():
170
+ if flag in weights and value == 1:
171
+ score += weights[flag]
172
+
173
+ # Normalize to 0-10 scale
174
+ max_possible_score = sum(weights.values())
175
+ normalized_score = (score / max_possible_score) * 10
176
+
177
+ # Cap at 10
178
+ return min(normalized_score, 10.0)
179
+
180
+ def get_priority_level(score):
181
+ """Get priority level based on score"""
182
+ if score >= 8.0:
183
+ return "TINGGI"
184
+ elif score >= 5.0:
185
+ return "SEDERHANA"
186
+ else:
187
+ return "RENDAH"
188
+
189
+ def run(sentiment_csv, agencies_csv=None, output_path=None, claim=None, claim_id=None, keywords=None):
190
+ """
191
+ Run priority indexing on sentiment data
192
+
193
+ Args:
194
+ sentiment_csv (str): Path to sentiment CSV file
195
+ agencies_csv (str, optional): Path to agencies CSV file
196
+ output_path (str, optional): Path to output JSON file
197
+ claim (str, optional): The claim text
198
+ claim_id (str, optional): Unique identifier for the claim
199
+ keywords (list, optional): List of keywords
200
+
201
+ Returns:
202
+ dict: Priority report data
203
+ """
204
+ print(f"[🔍] Loading sentiment data from: {sentiment_csv}")
205
+
206
+ try:
207
+ df = pd.read_csv(sentiment_csv)
208
+ except Exception as e:
209
+ print(f"[❌] Error reading sentiment data: {e}")
210
+ return None
211
+
212
+ # Load agency keywords
213
+ agency_keywords = load_agency_keywords(agencies_csv)
214
+
215
+ # Initialize flags
216
+ flags = {
217
+ "fact_check_value": 0,
218
+ "cause_confusion": 0,
219
+ "cause_chaos": 0,
220
+ "affects_government": 0,
221
+ "economic_impact": 0,
222
+ "law_related": 0,
223
+ "public_interest": 0,
224
+ "lives_in_danger": 0,
225
+ "viral": 0,
226
+ "urgent": 0
227
+ }
228
+
229
+ # Calculate sentiment counts
230
+ sentiment_counts = df['sentiment'].value_counts().to_dict()
231
+
232
+ # Convert numeric sentiments to text
233
+ sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
234
+ text_counts = {}
235
+
236
+ for k, v in sentiment_counts.items():
237
+ if k in sentiment_map:
238
+ text_counts[sentiment_map[k]] = v
239
+ else:
240
+ text_counts[k] = v
241
+
242
+ # Get total records
243
+ total_records = len(df)
244
+
245
+ # Calculate engagement metrics
246
+ total_likes = df['likes'].sum() if 'likes' in df.columns else 0
247
+ total_shares = df['shares'].sum() if 'shares' in df.columns else 0
248
+ total_comments = df['comments'].sum() if 'comments' in df.columns else 0
249
+ total_views = df['views'].sum() if 'views' in df.columns else 0
250
+
251
+ total_engagement = total_likes + total_shares + total_comments + total_views
252
+
253
+ # Check fact_check_value flag (based on engagement)
254
+ # Rule: High engagement indicates need for fact checking
255
+ if total_engagement > 10000:
256
+ flags["fact_check_value"] = 1
257
+ print(f"[📊] Flag: fact_check_value triggered (Total engagement: {total_engagement})")
258
+
259
+ # Check sentiment-based flags
260
+ pos = text_counts.get("positive", 0)
261
+ neg = text_counts.get("negative", 0)
262
+ neu = text_counts.get("neutral", 0)
263
+
264
+ total_sentiment = pos + neg + neu
265
+ if total_sentiment > 0:
266
+ pos_ratio = pos / total_sentiment
267
+ neg_ratio = neg / total_sentiment
268
+ neu_ratio = neu / total_sentiment
269
+
270
+ # Rule: cause_confusion if positive = negative OR neutral is high
271
+ if (abs(pos_ratio - neg_ratio) < 0.2 and pos_ratio > 0.2 and neg_ratio > 0.2) or (neu_ratio > 0.7):
272
+ flags["cause_confusion"] = 1
273
+ print(f"[📊] Flag: cause_confusion triggered (Pos: {pos_ratio:.2f}, Neg: {neg_ratio:.2f}, Neu: {neu_ratio:.2f})")
274
+
275
+ # Rule: cause_chaos if negative sentiment is high
276
+ if neg_ratio > 0.4:
277
+ flags["cause_chaos"] = 1
278
+ print(f"[📊] Flag: cause_chaos triggered (Negative: {neg_ratio:.2f})")
279
+
280
+ # Analyze text content for keywords
281
+ found_keywords = analyze_text_content(df, agency_keywords)
282
+
283
+ # Check government-related flag
284
+ # Rule: Contains government-related keywords
285
+ if found_keywords.get("government"):
286
+ flags["affects_government"] = 1
287
+ print(f"[📊] Flag: affects_government triggered (Gov terms: {', '.join(found_keywords['government'])})")
288
+
289
+ # Check economic impact flag
290
+ # Rule: Contains economic-related keywords
291
+ if found_keywords.get("economic"):
292
+ flags["economic_impact"] = 1
293
+ print(f"[📊] Flag: economic_impact triggered (Economic terms: {', '.join(found_keywords['economic'])})")
294
+
295
+ # Check law-related flag
296
+ # Rule: Contains law-related keywords
297
+ if found_keywords.get("law"):
298
+ flags["law_related"] = 1
299
+ print(f"[📊] Flag: law_related triggered (Law terms: {', '.join(found_keywords['law'])})")
300
+
301
+ # Check public interest flag
302
+ # Rule: High comments and shares indicate public interest
303
+ if (total_comments + total_shares) > 1000:
304
+ flags["public_interest"] = 1
305
+ print(f"[📊] Flag: public_interest triggered (Comments + Shares: {total_comments + total_shares})")
306
+
307
+ # Check danger-related flag
308
+ # Rule: Contains danger-related keywords
309
+ if found_keywords.get("danger"):
310
+ flags["lives_in_danger"] = 1
311
+ print(f"[📊] Flag: lives_in_danger triggered (Danger terms: {', '.join(found_keywords['danger'])})")
312
+
313
+ # Check viral flag
314
+ # Rule: High shares indicate virality
315
+ if total_shares > 1000:
316
+ flags["viral"] = 1
317
+ print(f"[📊] Flag: viral triggered (Total shares: {total_shares})")
318
+
319
+ # Check urgent flag
320
+ # Rule: If 5 or more flags are triggered, it's urgent
321
+ flags_triggered = sum(flags.values())
322
+ if flags_triggered >= 5:
323
+ flags["urgent"] = 1
324
+ print(f"[📊] Flag: urgent triggered ({flags_triggered} flags triggered)")
325
+
326
+ # Calculate priority score
327
+ priority_score = calculate_priority_score(flags)
328
+ priority_level = get_priority_level(priority_score)
329
+
330
+ # Prepare report data
331
+ report_data = {
332
+ "priority_flags": flags,
333
+ "priority_score": priority_score,
334
+ "priority_level": priority_level,
335
+ "sentiment_counts": text_counts,
336
+ "total_records": total_records,
337
+ "engagement": {
338
+ "likes": int(total_likes),
339
+ "shares": int(total_shares),
340
+ "comments": int(total_comments),
341
+ "views": int(total_views),
342
+ "total": int(total_engagement)
343
+ },
344
+ "found_keywords": found_keywords,
345
+ "claim": claim,
346
+ "keywords": keywords,
347
+ "timestamp": datetime.now().isoformat()
348
+ }
349
+
350
+ # Ensure output directory exists
351
+ if not output_path:
352
+ output_path = os.path.join("reports", os.path.basename(sentiment_csv).replace("_sentiment.csv", "_priority.json"))
353
+
354
+ os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
355
+ with open(output_path, 'w') as f:
356
+ json.dump(report_data, f, indent=4)
357
+
358
+ print(f"[📊] Priority index saved to {output_path}")
359
+ print(f"[📊] Priority score: {priority_score:.2f}/10 ({priority_level})")
360
+ return report_data
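
A minimal usage sketch of the scoring helpers above (the import path assumes this repo layout; the flag values are illustrative):

    from ai_api.library import priority_indexer

    # Three of the ten flags raised: weights 1.0 + 1.8 + 1.5 = 4.3 out of a 12.6 maximum,
    # which normalises to roughly 3.4/10 and therefore the "RENDAH" priority level.
    flags = {
        "fact_check_value": 1, "cause_confusion": 0, "cause_chaos": 1,
        "affects_government": 0, "economic_impact": 0, "law_related": 0,
        "public_interest": 0, "lives_in_danger": 1, "viral": 0, "urgent": 0,
    }
    score = priority_indexer.calculate_priority_score(flags)
    print(score, priority_indexer.get_priority_level(score))  # ~3.41 RENDAH
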
ai_api/library/sentiment_analyzer.py ADDED
@@ -0,0 +1,91 @@
1
+ # sentiment_analyzer.py
2
+ # Simple sentiment analyzer that doesn't require PyTorch
3
+
4
+ import pandas as pd
5
+ import re
6
+ import random
7
+ import os
8
+
9
+ def simple_sentiment_analysis(text):
10
+ """
11
+ A very simple rule-based sentiment analyzer for demonstration purposes.
12
+ Returns a sentiment label (neutral, positive, negative) and confidence score.
13
+ """
14
+ if not text or len(text.strip()) < 15:
15
+ return "neutral", 0.5
16
+
17
+ # Convert to lowercase
18
+ text = text.lower()
19
+
20
+ # Define positive and negative word lists (Malay and English)
21
+ positive_words = [
22
+ "baik", "bagus", "hebat", "cantik", "indah", "suka", "gembira", "senang",
23
+ "setuju", "betul", "benar", "berkesan", "berjaya", "cemerlang", "positif",
24
+ "good", "great", "excellent", "amazing", "wonderful", "happy", "like", "love",
25
+ "agree", "correct", "true", "effective", "successful", "positive"
26
+ ]
27
+
28
+ negative_words = [
29
+ "buruk", "teruk", "hodoh", "benci", "marah", "sedih", "kecewa", "susah",
30
+ "tidak setuju", "salah", "palsu", "gagal", "negatif", "masalah", "bahaya",
31
+ "bad", "terrible", "ugly", "hate", "angry", "sad", "disappointed", "difficult",
32
+ "disagree", "wrong", "false", "fail", "negative", "problem", "dangerous"
33
+ ]
34
+
35
+ # Count positive and negative words
36
+ positive_count = sum(1 for word in positive_words if re.search(r'\b' + re.escape(word) + r'\b', text))
37
+ negative_count = sum(1 for word in negative_words if re.search(r'\b' + re.escape(word) + r'\b', text))
38
+
39
+ # Determine sentiment
40
+ if positive_count > negative_count:
41
+ sentiment = "positive"
42
+ confidence = 0.5 + min(0.5, (positive_count - negative_count) / 10)
43
+ elif negative_count > positive_count:
44
+ sentiment = "negative"
45
+ confidence = 0.5 + min(0.5, (negative_count - positive_count) / 10)
46
+ else:
47
+ sentiment = "neutral"
48
+ confidence = 0.5
49
+
50
+ return sentiment, round(confidence, 4)
51
+
52
+ def run(csv_path, sentiment_output_path=None):
53
+ """
54
+ Runs sentiment analysis on combined comment + post text from the input CSV.
55
+ Saves the result (with sentiment + confidence columns) to a new CSV.
56
+ """
57
+ print(f"[📄] Reading dataset: {csv_path}")
58
+ df = pd.read_csv(csv_path)
59
+
60
+ # Combine comment and post text into a single field
61
+ df['combined_text'] = df['comment_text'].fillna('') + ". " + df['post_text'].fillna('')
62
+ df['combined_text'] = df['combined_text'].str.strip()
63
+
64
+ sentiments = []
65
+ confidences = []
66
+
67
+ print("[🔍] Running simple sentiment classification...")
68
+ for text in df['combined_text']:
69
+ sentiment, confidence = simple_sentiment_analysis(text)
70
+ sentiments.append(sentiment)
71
+ confidences.append(confidence)
72
+
73
+ # Add results to DataFrame
74
+ df['sentiment'] = sentiments
75
+ df['confidence'] = confidences
76
+
77
+ # Map sentiments to numeric values for compatibility with the rest of the system
78
+ sentiment_map = {
79
+ "neutral": 0,
80
+ "positive": 1,
81
+ "negative": 2
82
+ }
83
+ df['sentiment_value'] = df['sentiment'].map(sentiment_map)
84
+
85
+ # Determine the output path dynamically if not provided
86
+ if not sentiment_output_path:
87
+ sentiment_output_path = csv_path.replace(".csv", "_sentiment.csv")
88
+
89
+ df.to_csv(sentiment_output_path, index=False)
90
+ print(f"[💾] Sentiment analysis completed. Output saved to: {sentiment_output_path}")
91
+
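
A rough illustration of what the word-counting heuristic returns (the outputs follow directly from the rules above; the import path assumes this repo layout):

    from ai_api.library.sentiment_analyzer import simple_sentiment_analysis

    print(simple_sentiment_analysis("Cadangan ini sangat baik dan berkesan"))  # ('positive', 0.7)
    print(simple_sentiment_analysis("Berita palsu ini buruk dan bahaya"))      # ('negative', 0.8)
    print(simple_sentiment_analysis("ok"))                                     # ('neutral', 0.5), under 15 characters
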
ai_api/library/simple_keyword_extraction.py ADDED
@@ -0,0 +1,205 @@
1
+ # simple_keyword_extraction.py
2
+ # Simple keyword extraction for the claim analysis system
3
+
4
+ import re
5
+ from collections import Counter
6
+
7
+ # Malay/Indonesian and English stopwords used for keyword filtering
8
+ MALAY_STOPWORDS = [
9
+ "ada", "adalah", "adanya", "adapun", "agak", "agaknya", "agar", "akan", "akankah", "akhir",
10
+ "akhiri", "akhirnya", "aku", "akulah", "amat", "amatlah", "anda", "andalah", "antar", "antara",
11
+ "antaranya", "apa", "apaan", "apabila", "apakah", "apalagi", "apatah", "artinya", "asal", "asalkan",
12
+ "atas", "atau", "ataukah", "ataupun", "awal", "awalnya", "bagai", "bagaikan", "bagaimana", "bagaimanakah",
13
+ "bagaimanapun", "bagi", "bagian", "bahkan", "bahwa", "bahwasanya", "baik", "bakal", "bakalan", "balik",
14
+ "banyak", "bapak", "baru", "bawah", "beberapa", "begini", "beginian", "beginikah", "beginilah", "begitu",
15
+ "begitukah", "begitulah", "begitupun", "bekerja", "belakang", "belakangan", "belum", "belumlah", "benar",
16
+ "benarkah", "benarlah", "berada", "berakhir", "berakhirlah", "berakhirnya", "berapa", "berapakah", "berapalah",
17
+ "berapapun", "berarti", "berawal", "berbagai", "berdatangan", "beri", "berikan", "berikut", "berikutnya",
18
+ "berjumlah", "berkali-kali", "berkata", "berkehendak", "berkeinginan", "berkenaan", "berlainan", "berlalu",
19
+ "berlangsung", "berlebihan", "bermacam", "bermacam-macam", "bermaksud", "bermula", "bersama", "bersama-sama",
20
+ "bersiap", "bersiap-siap", "bertanya", "bertanya-tanya", "berturut", "berturut-turut", "bertutur", "berujar",
21
+ "berupa", "besar", "betul", "betulkah", "biasa", "biasanya", "bila", "bilakah", "bisa", "bisakah", "boleh",
22
+ "bolehkah", "bolehlah", "buat", "bukan", "bukankah", "bukanlah", "bukannya", "bulan", "bung", "cara", "caranya",
23
+ "cukup", "cukupkah", "cukuplah", "cuma", "dahulu", "dalam", "dan", "dapat", "dari", "daripada", "datang",
24
+ "dekat", "demi", "demikian", "demikianlah", "dengan", "depan", "di", "dia", "diakhiri", "diakhirinya", "dialah",
25
+ "diantara", "diantaranya", "diberi", "diberikan", "diberikannya", "dibuat", "dibuatnya", "didapat", "didatangkan",
26
+ "digunakan", "diibaratkan", "diibaratkannya", "diingat", "diingatkan", "diinginkan", "dijawab", "dijelaskan",
27
+ "dijelaskannya", "dikarenakan", "dikatakan", "dikatakannya", "dikerjakan", "diketahui", "diketahuinya", "dikira",
28
+ "dilakukan", "dilalui", "dilihat", "dimaksud", "dimaksudkan", "dimaksudkannya", "dimaksudnya", "diminta",
29
+ "dimintai", "dimisalkan", "dimulai", "dimulailah", "dimulainya", "dimungkinkan", "dini", "dipastikan",
30
+ "diperbuat", "diperbuatnya", "dipergunakan", "diperkirakan", "diperlihatkan", "diperlukan", "diperlukannya",
31
+ "dipersoalkan", "dipertanyakan", "dipunyai", "diri", "dirinya", "disampaikan", "disebut", "disebutkan",
32
+ "disebutkannya", "disini", "disinilah", "ditambahkan", "ditandaskan", "ditanya", "ditanyai", "ditanyakan",
33
+ "ditegaskan", "ditujukan", "ditunjuk", "ditunjuki", "ditunjukkan", "ditunjukkannya", "ditunjuknya", "dituturkan",
34
+ "dituturkannya", "diucapkan", "diucapkannya", "diungkapkan", "dong", "dua", "dulu", "empat", "enggak", "enggaknya",
35
+ "entah", "entahlah", "guna", "gunakan", "hal", "hampir", "hanya", "hanyalah", "hari", "harus", "haruslah",
36
+ "harusnya", "hendak", "hendaklah", "hendaknya", "hingga", "ia", "ialah", "ibarat", "ibaratkan", "ibaratnya",
37
+ "ibu", "ikut", "ingat", "ingat-ingat", "ingin", "inginkah", "inginkan", "ini", "inikah", "inilah", "itu",
38
+ "itukah", "itulah", "jadi", "jadilah", "jadinya", "jangan", "jangankan", "janganlah", "jauh", "jawab",
39
+ "jawaban", "jawabnya", "jelas", "jelaskan", "jelaslah", "jelasnya", "jika", "jikalau", "juga", "jumlah",
40
+ "jumlahnya", "justru", "kala", "kalau", "kalaulah", "kalaupun", "kalian", "kami", "kamilah", "kamu", "kamulah",
41
+ "kan", "kapan", "kapankah", "kapanpun", "karena", "karenanya", "kasus", "kata", "katakan", "katakanlah",
42
+ "katanya", "ke", "keadaan", "kebetulan", "kecil", "kedua", "keduanya", "keinginan", "kelamaan", "kelihatan",
43
+ "kelihatannya", "kelima", "keluar", "kembali", "kemudian", "kemungkinan", "kemungkinannya", "kenapa", "kepada",
44
+ "kepadanya", "kesamaan", "keseluruhan", "keseluruhannya", "keterlaluan", "ketika", "khususnya", "kini", "kinilah",
45
+ "kira", "kira-kira", "kiranya", "kita", "kitalah", "kok", "kurang", "lagi", "lagian", "lah", "lain", "lainnya",
46
+ "lalu", "lama", "lamanya", "lanjut", "lanjutnya", "lebih", "lewat", "lima", "luar", "macam", "maka", "makanya",
47
+ "makin", "malah", "malahan", "mampu", "mampukah", "mana", "manakala", "manalagi", "masa", "masalah", "masalahnya",
48
+ "masih", "masihkah", "masing", "masing-masing", "mau", "maupun", "melainkan", "melakukan", "melalui", "melihat",
49
+ "melihatnya", "memang", "memastikan", "memberi", "memberikan", "membuat", "memerlukan", "memihak", "meminta",
50
+ "memintakan", "memisalkan", "memperbuat", "mempergunakan", "memperkirakan", "memperlihatkan", "mempersiapkan",
51
+ "mempersoalkan", "mempertanyakan", "mempunyai", "memulai", "memungkinkan", "menaiki", "menambahkan", "menandaskan",
52
+ "menanti", "menanti-nanti", "menantikan", "menanya", "menanyai", "menanyakan", "mendapat", "mendapatkan",
53
+ "mendatang", "mendatangi", "mendatangkan", "menegaskan", "mengakhiri", "mengapa", "mengatakan", "mengatakannya",
54
+ "mengenai", "mengerjakan", "mengetahui", "menggunakan", "menghendaki", "mengibaratkan", "mengibaratkannya",
55
+ "mengingat", "mengingatkan", "menginginkan", "mengira", "mengucapkan", "mengucapkannya", "mengungkapkan",
56
+ "menjadi", "menjawab", "menjelaskan", "menuju", "menunjuk", "menunjuki", "menunjukkan", "menunjuknya", "menurut",
57
+ "menuturkan", "menyampaikan", "menyangkut", "menyatakan", "menyebutkan", "menyeluruh", "menyiapkan", "merasa",
58
+ "mereka", "merekalah", "merupakan", "meski", "meskipun", "meyakini", "meyakinkan", "minta", "mirip", "misal",
59
+ "misalkan", "misalnya", "mula", "mulai", "mulailah", "mulanya", "mungkin", "mungkinkah", "nah", "naik", "namun",
60
+ "nanti", "nantinya", "nyaris", "nyatanya", "oleh", "olehnya", "pada", "padahal", "padanya", "pak", "paling",
61
+ "panjang", "pantas", "para", "pasti", "pastilah", "penting", "pentingnya", "per", "percuma", "perlu", "perlukah",
62
+ "perlunya", "pernah", "persoalan", "pertama", "pertama-tama", "pertanyaan", "pertanyakan", "pihak", "pihaknya",
63
+ "pukul", "pula", "pun", "punya", "rasa", "rasanya", "rata", "rupanya", "saat", "saatnya", "saja", "sajalah",
64
+ "saling", "sama", "sama-sama", "sambil", "sampai", "sampai-sampai", "sampaikan", "sana", "sangat", "sangatlah",
65
+ "satu", "saya", "sayalah", "se", "sebab", "sebabnya", "sebagai", "sebagaimana", "sebagainya", "sebagian",
66
+ "sebaik", "sebaik-baiknya", "sebaiknya", "sebaliknya", "sebanyak", "sebegini", "sebegitu", "sebelum", "sebelumnya",
67
+ "sebenarnya", "seberapa", "sebesar", "sebetulnya", "sebisanya", "sebuah", "sebut", "sebutlah", "sebutnya",
68
+ "secara", "secukupnya", "sedang", "sedangkan", "sedemikian", "sedikit", "sedikitnya", "seenaknya", "segala",
69
+ "segalanya", "segera", "seharusnya", "sehingga", "seingat", "sejak", "sejauh", "sejenak", "sejumlah", "sekadar",
70
+ "sekadarnya", "sekali", "sekali-kali", "sekalian", "sekaligus", "sekalipun", "sekarang", "sekarang", "sekecil",
71
+ "seketika", "sekiranya", "sekitar", "sekitarnya", "sekurang-kurangnya", "sekurangnya", "sela", "selain", "selaku",
72
+ "selalu", "selama", "selama-lamanya", "selamanya", "selanjutnya", "seluruh", "seluruhnya", "semacam", "semakin",
73
+ "semampu", "semampunya", "semasa", "semasih", "semata", "semata-mata", "semaunya", "sementara", "semisal",
74
+ "semisalnya", "sempat", "semua", "semuanya", "semula", "sendiri", "sendirian", "sendirinya", "seolah",
75
+ "seolah-olah", "seorang", "sepanjang", "sepantasnya", "sepantasnyalah", "seperlunya", "seperti", "sepertinya",
76
+ "sepihak", "sering", "seringnya", "serta", "serupa", "sesaat", "sesama", "sesampai", "sesegera", "sesekali",
77
+ "seseorang", "sesuatu", "sesuatunya", "sesudah", "sesudahnya", "setelah", "setempat", "setengah", "seterusnya",
78
+ "setiap", "setiba", "setibanya", "setidak-tidaknya", "setidaknya", "setinggi", "seusai", "sewaktu", "siap",
79
+ "siapa", "siapakah", "siapapun", "sini", "sinilah", "soal", "soalnya", "suatu", "sudah", "sudahkah", "sudahlah",
80
+ "supaya", "tadi", "tadinya", "tahu", "tahun", "tak", "tambah", "tambahnya", "tampak", "tampaknya", "tandas",
81
+ "tandasnya", "tanpa", "tanya", "tanyakan", "tanyanya", "tapi", "tegas", "tegasnya", "telah", "tempat", "tengah",
82
+ "tentang", "tentu", "tentulah", "tentunya", "tepat", "terakhir", "terasa", "terbanyak", "terdahulu", "terdapat",
83
+ "terdiri", "terhadap", "terhadapnya", "teringat", "teringat-ingat", "terjadi", "terjadilah", "terjadinya",
84
+ "terkira", "terlalu", "terlebih", "terlihat", "termasuk", "ternyata", "tersampaikan", "tersebut", "tersebutlah",
85
+ "tertentu", "tertuju", "terus", "terutama", "tetap", "tetapi", "tiap", "tiba", "tiba-tiba", "tidak", "tidakkah",
86
+ "tidaklah", "tiga", "tinggi", "toh", "tunjuk", "turut", "tutur", "tuturnya", "ucap", "ucapnya", "ujar", "ujarnya",
87
+ "umum", "umumnya", "ungkap", "ungkapnya", "untuk", "usah", "usai", "waduh", "wah", "wahai", "waktu", "waktunya",
88
+ "walau", "walaupun", "wong", "yaitu", "yakin", "yakni", "yang", "ke", "pada", "ini", "itu", "juga", "dari", "dalam",
89
+ "akan", "jika", "maka", "karena", "oleh", "dengan", "atau", "secara", "untuk", "adalah", "sebagai", "bahwa", "hanya",
90
+ "namun", "tetapi", "ketika", "setelah", "sebelum", "selama", "sejak", "hingga", "sampai", "tentang", "seperti",
91
+ "terhadap", "melalui", "menurut", "berdasarkan", "mengenai", "antara", "di", "si", "sang", "para", "the", "of", "and",
92
+ "a", "to", "in", "that", "it", "with", "as", "for", "on", "was", "is", "by", "at", "this", "an", "are", "not", "from",
93
+ "but", "have", "had", "has", "be", "been", "were", "which", "or", "we", "their", "his", "her", "they", "its", "he",
94
+ "she", "you", "my", "all", "can", "would", "could", "should", "may", "might", "must", "shall", "will", "them", "there",
95
+ "these", "those", "some", "any", "no", "nor", "so", "such", "than", "then", "thus", "up", "down", "out", "about", "into",
96
+ "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "what", "who",
97
+ "whom", "this", "that", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
98
+ "does", "did", "doing", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
99
+ "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
100
+ "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
101
+ "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
102
+ "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
103
+ "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out",
104
+ "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all",
105
+ "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
106
+ "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
107
+ ]
108
+
109
+ def extract_keywords(text, top_n=10):
110
+ """
111
+ Extract keywords from text using a simple frequency-based approach
112
+
113
+ Args:
114
+ text (str): Text to extract keywords from
115
+ top_n (int): Number of keywords to extract
116
+
117
+ Returns:
118
+ list: List of extracted keywords
119
+ """
120
+ # Convert to lowercase
121
+ text = text.lower()
122
+
123
+ # Remove punctuation and split into words
124
+ words = re.findall(r'\b\w+\b', text)
125
+
126
+ # Remove stopwords
127
+ words = [word for word in words if word not in MALAY_STOPWORDS and len(word) > 2]
128
+
129
+ # Count word frequencies
130
+ word_counts = Counter(words)
131
+
132
+ # Get top N keywords
133
+ keywords = [word for word, count in word_counts.most_common(top_n)]
134
+
135
+ # If we have fewer than top_n keywords, return what we have
136
+ return keywords
137
+
138
+ def optimize_keywords_for_platforms(keywords):
139
+ """
140
+ Optimize keywords for different platforms
141
+
142
+ Args:
143
+ keywords (list): List of keywords
144
+
145
+ Returns:
146
+ dict: Dictionary with optimized keywords for each platform
147
+ """
148
+ return {
149
+ "tiktok": keywords[:3],
150
+ "web_search": keywords[:5]
151
+ }
152
+
153
+ def detect_claim_type(text):
154
+ """
155
+ Detect the type of claim based on keywords
156
+
157
+ Args:
158
+ text (str): The claim text
159
+
160
+ Returns:
161
+ str: The type of claim
162
+ """
163
+ text = text.lower()
164
+
165
+ # Define keyword sets for different claim types
166
+ economic_keywords = ["ekonomi", "cukai", "harga", "kewangan", "bank", "ringgit", "subsidi", "kos", "bayaran", "hutang"]
167
+ political_keywords = ["kerajaan", "politik", "perdana menteri", "menteri", "parlimen", "pilihan raya", "parti", "kabinet"]
168
+ health_keywords = ["kesihatan", "penyakit", "hospital", "vaksin", "ubat", "doktor", "covid", "virus", "pandemik"]
169
+ social_keywords = ["sosial", "masyarakat", "pendidikan", "sekolah", "universiti", "pelajar", "guru", "agama"]
170
+ security_keywords = ["keselamatan", "polis", "tentera", "jenayah", "penjenayah", "senjata", "serangan"]
171
+
172
+ # Count matches for each category
173
+ economic_count = sum(1 for keyword in economic_keywords if keyword in text)
174
+ political_count = sum(1 for keyword in political_keywords if keyword in text)
175
+ health_count = sum(1 for keyword in health_keywords if keyword in text)
176
+ social_count = sum(1 for keyword in social_keywords if keyword in text)
177
+ security_count = sum(1 for keyword in security_keywords if keyword in text)
178
+
179
+ # Determine the dominant category
180
+ counts = {
181
+ "Ekonomi": economic_count,
182
+ "Politik": political_count,
183
+ "Kesihatan": health_count,
184
+ "Sosial": social_count,
185
+ "Keselamatan": security_count
186
+ }
187
+
188
+ # Get the category with the highest count
189
+ dominant_category = max(counts, key=counts.get)
190
+
191
+ # If no matches, return "Umum"
192
+ if counts[dominant_category] == 0:
193
+ return "Umum"
194
+
195
+ return dominant_category
196
+
197
+ if __name__ == "__main__":
198
+ # Test the function
199
+ test_text = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
200
+ keywords = extract_keywords(test_text)
201
+ print(f"Extracted keywords: {keywords}")
202
+
203
+ optimized = optimize_keywords_for_platforms(keywords)
204
+ print(f"Optimized for TikTok: {optimized['tiktok']}")
205
+ print(f"Optimized for web search: {optimized['web_search']}")
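
The test block above only exercises keyword extraction; detect_claim_type can be checked the same way. For this sample claim "cukai" (Ekonomi) and "kerajaan" (Politik) each match once, and ties resolve to the first category in the counts dict:

    from ai_api.library.simple_keyword_extraction import detect_claim_type

    claim = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
    print(detect_claim_type(claim))  # "Ekonomi"
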
ai_api/library/websearch.py ADDED
@@ -0,0 +1,237 @@
1
+ """
2
+ websearch.py
3
+ Module for running web searches and saving results
4
+ """
5
+
6
+ import pandas as pd
7
+ from datetime import datetime
8
+ import os
9
+
10
+ def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True, use_duckduckgo=True, full_claim=None):
11
+ """
12
+ Run web search for keywords and save results to CSV
13
+
14
+ Args:
15
+ keywords (list): List of keywords to search for
16
+ output_path (str): Path to save results
17
+ num_results (int): Number of results per keyword
18
+ use_serpapi (bool): Whether to use SerpApi
19
+ use_serper (bool): Whether to use Serper.dev
20
+ use_duckduckgo (bool): Whether to use DuckDuckGo
21
+ full_claim (str): The full claim text to use as a search query
22
+
23
+ Returns:
24
+ int: Number of results saved
25
+ """
26
+ # Import search functions
27
+ try:
28
+ from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
29
+ except ImportError:
30
+ print("Error importing web_search module. Make sure it exists and is accessible.")
31
+ return 0
32
+
33
+ # Create search queries
34
+ all_results = []
35
+
36
+ # Always use the full claim directly if available
37
+ if full_claim:
38
+ print(f"Using full claim as direct search query: '{full_claim}'")
39
+
40
+ # Search using SerpApi with the exact claim
41
+ if use_serpapi:
42
+ print("Searching with SerpApi (exact claim)...")
43
+ serpapi_results = search_serpapi(full_claim, num_results=num_results)
44
+ if serpapi_results:
45
+ print(f"Found {len(serpapi_results)} results from SerpApi (exact claim)")
46
+ all_results.extend(serpapi_results)
47
+ else:
48
+ print("No results from SerpApi (exact claim)")
49
+
50
+ # Search using Serper.dev with the exact claim
51
+ if use_serper:
52
+ print("Searching with Serper.dev (exact claim)...")
53
+ serper_results = search_serper(full_claim, num_results=num_results)
54
+ if serper_results:
55
+ print(f"Found {len(serper_results)} results from Serper.dev (exact claim)")
56
+ all_results.extend(serper_results)
57
+ else:
58
+ print("No results from Serper.dev (exact claim)")
59
+
60
+ # For crime-related claims, also try targeted queries
61
+ crime_related = any(term in full_claim.lower() for term in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"])
62
+ kelantan_related = "kelantan" in full_claim.lower()
63
+
64
+ if crime_related and kelantan_related:
65
+ # Check if this is about sexual crimes or ammunition
66
+ ammunition_related = any(term in full_claim.lower() for term in ["kelongsong", "peluru", "senjata", "tan"])
67
+
68
+ if ammunition_related:
69
+ targeted_queries = [
70
+ "50 tan kelongsong peluru ditemui",
71
+ "kilang haram proses kelongsong peluru",
72
+ "penemuan kelongsong peluru di kilang",
73
+ "kelongsong peluru musuh negara"
74
+ ]
75
+ else:
76
+ # Default to sexual crime queries
77
+ targeted_queries = [
78
+ "statistik jenayah seksual di kelantan",
79
+ "kes rogol dan sumbang mahram di kelantan meningkat",
80
+ "pdrm kelantan lapor kes rogol"
81
+ ]
82
+
83
+ for query in targeted_queries:
84
+ print(f"Using targeted query: '{query}'")
85
+
86
+ # Search using SerpApi
87
+ if use_serpapi:
88
+ print(f"Searching with SerpApi (targeted query: {query})...")
89
+ serpapi_results = search_serpapi(query, num_results=num_results//2) # Use fewer results for each targeted query
90
+ if serpapi_results:
91
+ print(f"Found {len(serpapi_results)} results from SerpApi (targeted query)")
92
+ all_results.extend(serpapi_results)
93
+ else:
94
+ print(f"No results from SerpApi (targeted query: {query})")
95
+
96
+ # Search using Serper.dev
97
+ if use_serper:
98
+ print(f"Searching with Serper.dev (targeted query: {query})...")
99
+ serper_results = search_serper(query, num_results=num_results//2) # Use fewer results for each targeted query
100
+ if serper_results:
101
+ print(f"Found {len(serper_results)} results from Serper.dev (targeted query)")
102
+ all_results.extend(serper_results)
103
+ else:
104
+ print(f"No results from Serper.dev (targeted query: {query})")
105
+ else:
106
+ # For other claims, use the original approach with keywords
107
+ # 1. Full claim query (if available)
108
+ full_claim_query = f'"{full_claim}"' if full_claim else None
109
+
110
+ # 2. Keyword-based query
111
+ search_terms = []
112
+ for kw in keywords:
113
+ # If keyword contains spaces (multi-word phrase), wrap in quotes
114
+ if " " in kw:
115
+ search_terms.append(f'"{kw}"')
116
+ else:
117
+ # For single words, don't use quotes to get broader results
118
+ search_terms.append(kw)
119
+
120
+ keyword_query = " OR ".join(search_terms)
121
+
122
+ # Search using full claim first (if available)
123
+ if full_claim_query:
124
+ print(f"Searching with full claim: {full_claim_query}")
125
+
126
+ # Search using SerpApi
127
+ if use_serpapi:
128
+ print("Searching with SerpApi (full claim)...")
129
+ serpapi_results = search_serpapi(full_claim, num_results=num_results)
130
+ if serpapi_results:
131
+ print(f"Found {len(serpapi_results)} results from SerpApi (full claim)")
132
+ all_results.extend(serpapi_results)
133
+ else:
134
+ print("No results from SerpApi (full claim)")
135
+
136
+ # Search using Serper.dev
137
+ if use_serper:
138
+ print("Searching with Serper.dev (full claim)...")
139
+ serper_results = search_serper(full_claim, num_results=num_results)
140
+ if serper_results:
141
+ print(f"Found {len(serper_results)} results from Serper.dev (full claim)")
142
+ all_results.extend(serper_results)
143
+ else:
144
+ print("No results from Serper.dev (full claim)")
145
+
146
+ # Search using keyword query as fallback
147
+ if not all_results or len(all_results) < num_results:
148
+ print(f"Searching with keyword query: {keyword_query}")
149
+
150
+ # Search using SerpApi
151
+ if use_serpapi:
152
+ print("Searching with SerpApi (keywords)...")
153
+ serpapi_results = search_serpapi(keyword_query, num_results=num_results)
154
+ if serpapi_results:
155
+ print(f"Found {len(serpapi_results)} results from SerpApi (keywords)")
156
+ all_results.extend(serpapi_results)
157
+ else:
158
+ print("No results from SerpApi (keywords)")
159
+
160
+ # Search using Serper.dev
161
+ if use_serper:
162
+ print("Searching with Serper.dev (keywords)...")
163
+ serper_results = search_serper(keyword_query, num_results=num_results)
164
+ if serper_results:
165
+ print(f"Found {len(serper_results)} results from Serper.dev (keywords)")
166
+ all_results.extend(serper_results)
167
+ else:
168
+ print("No results from Serper.dev (keywords)")
169
+
170
+ # Add DuckDuckGo results
171
+ if use_duckduckgo:
172
+ query_to_use = full_claim if full_claim else keyword_query
173
+ print(f"Searching with DuckDuckGo using: {query_to_use}")
174
+ duckduckgo_results = search_duckduckgo(query_to_use, num_results=num_results)
175
+ if duckduckgo_results:
176
+ print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
177
+ all_results.extend(duckduckgo_results)
178
+ else:
179
+ print("No results from DuckDuckGo")
180
+
181
+ # Add Google Trends data
182
+ trends_data = get_google_trends(keywords)
183
+
184
+ # Convert to DataFrame
185
+ if all_results:
186
+ # Remove duplicates based on URL
187
+ unique_results = []
188
+ seen_urls = set()
189
+
190
+ for result in all_results:
191
+ url = result.get('link', '')
192
+ if url and url not in seen_urls:
193
+ seen_urls.add(url)
194
+ unique_results.append(result)
195
+
196
+ print(f"Removed {len(all_results) - len(unique_results)} duplicate results")
197
+
198
+ df = pd.DataFrame(unique_results)
199
+
200
+ # Add additional columns to match the format expected by the sentiment analyzer
201
+ df['platform'] = 'web'
202
+ df['username'] = df['source']
203
+ df['post_text'] = df['snippet']
204
+ df['post_url'] = df['link']
205
+ df['likes'] = 0
206
+ df['shares'] = 0
207
+ df['comments_count'] = 0
208
+ df['comment_text'] = ''
209
+ df['combined_text'] = df['title'] + ' ' + df['snippet']
210
+ df['date'] = datetime.now().strftime('%Y-%m-%d')
211
+
212
+ # Create output directory if it doesn't exist
213
+ os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
214
+
215
+ # Save to CSV
216
+ df.to_csv(output_path, index=False)
217
+ print(f"Saved {len(df)} web search results to {output_path}")
218
+ return len(df)
219
+ else:
220
+ print("No web search results found")
221
+ return 0
222
+
223
+ # Test the module
224
+ if __name__ == "__main__":
225
+ import sys
226
+
227
+ # Get keywords from command line or use default
228
+ if len(sys.argv) > 1:
229
+ keywords = sys.argv[1:]
230
+ full_claim = " ".join(sys.argv[1:])
231
+ else:
232
+ keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
233
+ full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"
234
+
235
+ # Run web search
236
+ output_path = "output/web_search_results.csv"
237
+ run(keywords, output_path, num_results=10, full_claim=full_claim)
ai_api/middleware.py ADDED
@@ -0,0 +1,40 @@
1
+ # middleware.py
2
+ import hashlib
3
+ import hmac
4
+ from django.http import JsonResponse
5
+ from ai_api.models import APIClient
6
+
7
+ class HMACAuthMiddleware:
8
+ def __init__(self, get_response):
9
+ self.get_response = get_response
10
+
11
+ def __call__(self, request):
12
+ # if request.path.startswith('/admin/'):
13
+ # return self.get_response(request)
14
+ if not request.path.startswith('/api/'):
15
+ return self.get_response(request)
16
+
17
+ client_id = request.headers.get('X-Client-ID')
18
+ signature = request.headers.get('X-Signature')
19
+
20
+ if not client_id or not signature:
21
+ return JsonResponse({'error': 'Missing credentials'}, status=401)
22
+
23
+ from ai_api.models import APIClient
24
+ try:
25
+ client = APIClient.objects.get(client_id=client_id)
26
+ except APIClient.DoesNotExist:
27
+ return JsonResponse({'error': 'Invalid client ID'}, status=401)
28
+
29
+ expected_signature = hmac.new(
30
+ client.secret_key.encode(),
31
+ request.body,
32
+ hashlib.sha256
33
+ ).hexdigest()
34
+
35
+ if not hmac.compare_digest(expected_signature, signature):
36
+ return JsonResponse({'error': 'Invalid signature'}, status=401)
37
+
38
+ request.api_client = client
39
+ return self.get_response(request)
40
+
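
Callers of the /api/ endpoints must sign the exact request body bytes with their secret key. A minimal client-side sketch using the requests library (the endpoint URL and credentials are placeholders, not values defined in this repo):

    import hashlib
    import hmac
    import json

    import requests

    client_id = "<APIClient.client_id>"
    secret_key = "<APIClient.secret_key>"
    body = json.dumps({"claim": "contoh dakwaan"}).encode()

    # The middleware verifies HMAC-SHA256 over request.body, so sign the same bytes that are sent.
    signature = hmac.new(secret_key.encode(), body, hashlib.sha256).hexdigest()
    response = requests.post(
        "http://localhost:8000/api/classification/",  # placeholder endpoint
        data=body,
        headers={
            "Content-Type": "application/json",
            "X-Client-ID": client_id,
            "X-Signature": signature,
        },
    )
    print(response.status_code, response.text)
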
ai_api/migrations/0001_initial.py ADDED
@@ -0,0 +1,24 @@
1
+ # Generated by Django 4.2.20 on 2025-05-08 00:50
2
+
3
+ from django.db import migrations, models
4
+
5
+
6
+ class Migration(migrations.Migration):
7
+
8
+ initial = True
9
+
10
+ dependencies = [
11
+ ]
12
+
13
+ operations = [
14
+ migrations.CreateModel(
15
+ name='APIClient',
16
+ fields=[
17
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
18
+ ('name', models.CharField(max_length=100, unique=True)),
19
+ ('client_id', models.CharField(editable=False, max_length=32, unique=True)),
20
+ ('secret_key', models.CharField(editable=False, max_length=64)),
21
+ ('created_at', models.DateTimeField(auto_now_add=True)),
22
+ ],
23
+ ),
24
+ ]
ai_api/migrations/__init__.py ADDED
File without changes
ai_api/models.py ADDED
@@ -0,0 +1,18 @@
1
+ from django.db import models
2
+ import secrets
3
+
4
+ class APIClient(models.Model):
5
+ name = models.CharField(max_length=100, unique=True)
6
+ client_id = models.CharField(max_length=32, unique=True, editable=False)
7
+ secret_key = models.CharField(max_length=64, editable=False)
8
+ created_at = models.DateTimeField(auto_now_add=True)
9
+
10
+ def save(self, *args, **kwargs):
11
+ if not self.client_id:
12
+ self.client_id = secrets.token_hex(16)
13
+ if not self.secret_key:
14
+ self.secret_key = secrets.token_hex(32)
15
+ super().save(*args, **kwargs)
16
+
17
+ def __str__(self):
18
+ return self.name
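
Because save() fills in the credentials, registering a client only needs a name, for example from a Django shell (the client name here is hypothetical):

    from ai_api.models import APIClient

    client = APIClient.objects.create(name="newsroom-dashboard")
    print(client.client_id)   # 32-character hex token
    print(client.secret_key)  # 64-character hex token
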
ai_api/request_serializer.py ADDED
@@ -0,0 +1,30 @@
1
+ from rest_framework import serializers
2
+
3
+ class TranscriptionRequestSerializer(serializers.Serializer):
4
+ url = serializers.URLField(required=False, allow_null=True)
5
+ media = serializers.FileField(required=False, allow_null=True)
6
+
7
+ def validate(self, attrs):
8
+ url = attrs.get('url')
9
+ media = attrs.get('media')
10
+
11
+ if not url and not media:
12
+ raise serializers.ValidationError("Either 'url' or 'media' must be provided.")
13
+
14
+ return attrs
15
+
16
+ def validate_media(self, file):
17
+ if file is None:
18
+ return file
19
+
20
+ allowed_types = ['audio/', 'video/']
21
+ content_type = getattr(file, 'content_type', '')
22
+
23
+ if not any(content_type.startswith(t) for t in allowed_types):
24
+ raise serializers.ValidationError("Only audio or video files are allowed.")
25
+
26
+ return file
27
+
28
+ class ClassificationRequestSerializer(serializers.Serializer):
29
+ claim = serializers.CharField()
30
+
ai_api/templates/base-copy.html ADDED
@@ -0,0 +1,35 @@
1
+ <!-- templates/base.html -->
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>{% block title %}My Django Project{% endblock %}</title>
8
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
9
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
10
+ </head>
11
+ <body>
12
+ <!-- Navbar (optional) -->
13
+ <nav class="navbar navbar-expand-lg navbar-light bg-light ps-2">
14
+ <a class="navbar-brand" href="/">Home</a>
15
+ </nav>
16
+
17
+ <!-- Main content area -->
18
+ <div class="container m-2">
19
+ {% block content %}{% endblock %}
20
+ </div>
21
+
22
+ <!-- Footer (optional) -->
23
+ <footer class="bg-light text-center py-3">
24
+ <p>&copy; 2025 BERNAMA Fact Check</p>
25
+ </footer>
26
+
27
+ <!-- jQuery Library -->
28
+ <script src="https://code.jquery.com/jquery-3.6.4.min.js"
29
+ integrity="sha256-oP6HI9z1XaZNBrJURtCoUT5SUnxFr8s3BzRl+cbzUq8="
30
+ crossorigin="anonymous"></script>
31
+
32
+
33
+ {% block scripts %}{% endblock %}
34
+ </body>
35
+ </html>
ai_api/templates/base.html ADDED
@@ -0,0 +1,61 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>{% block title %}BERNAMA Fact Check{% endblock %}</title>
7
+ {% load static %}
8
+
9
+ <!-- Bootstrap CSS -->
10
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
11
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha512-...hash..." crossorigin="anonymous" referrerpolicy="no-referrer" />
12
+ <link rel="stylesheet" href="{% static 'js/DataTables/datatables.min.css' %}">
13
+
14
+ <link rel="icon" href="{% static 'favicon.ico' %}" type="image/x-icon">
15
+ <!-- Optional: Custom dark mode toggle -->
16
+ <style>
17
+ body.dark-mode {
18
+ background-color: #121212;
19
+ color: #f8f9fa;
20
+ }
21
+ body.dark-mode .bg-light {
22
+ background-color: #1f1f1f !important;
23
+ }
24
+ body.dark-mode .text-muted {
25
+ color: #adb5bd !important;
26
+ }
27
+ </style>
28
+ </head>
29
+ <body class="dark-mode">
30
+
31
+ <!-- Hero Section -->
32
+ <section class="py-5 bg-light text-center shadow">
33
+ <div class="container">
34
+ <h1 class="display-5 fw-bold mb-3">AI Feature Testing Bed</h1>
35
+ <p class="lead text-muted mb-4">Experiment with cutting-edge AI modules like Face Recognition and Speech Transcription in one place.</p>
36
+ <a href="/#features" class="btn btn-primary btn-lg">Explore Features</a>
37
+ </div>
38
+ </section>
39
+
40
+ <!-- Main Section -->
41
+ <section class="py-5">
42
+ <div class="container">
43
+ {% block content %}{% endblock %}
44
+ </div>
45
+ </section>
46
+
47
+ <!-- Footer -->
48
+ <footer class="text-center py-4 text-muted">
49
+ © 2025 BERNAMA Fact Check. All rights reserved.
50
+ </footer>
51
+
52
+ <!-- Bootstrap JS Bundle (with Popper) -->
53
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
54
+
55
+ <!-- jQuery -->
56
+ <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
57
+ <script src="{% static 'js/DataTables/datatables.min.js' %}"></script>
58
+
59
+ {% block scripts %}{% endblock %}
60
+ </body>
61
+ </html>
ai_api/templates/classification.html ADDED
@@ -0,0 +1,142 @@
1
+ {% extends 'base.html' %}
2
+
3
+ {% block content %}
4
+ <div class="container py-4">
5
+ <h2 class="mb-4 fw-bold text-white">Classification</h2>
6
+
7
+ <form id="classificationForm" method="POST">
8
+ {% csrf_token %}
9
+ {{ form.as_p }}
10
+ <button type="submit" class="btn btn-primary mt-3">
11
+ Submit
12
+ </button>
13
+ </form>
14
+
15
+ <!-- Progress Bar -->
16
+ <div id="progressContainer" class="mt-4" style="display: none;">
17
+ <div class="progress">
18
+ <div id="progressBar" class="progress-bar progress-bar-striped progress-bar-animated" role="progressbar" style="width: 0%"></div>
19
+ </div>
20
+ <p id="progressText" class="text-white mt-2"></p>
21
+ </div>
22
+
23
+ <!-- Results Container -->
24
+ <div id="resultsContainer" style="display: none;">
25
+ <div class="alert alert-secondary text-uppercase small mt-4">
26
+ <p><strong>Category:</strong> <span id="category"></span></p>
27
+ <p><strong>Keywords:</strong> <span id="keywords"></span></p>
28
+ <p><strong>Priority Index:</strong> <span id="priorityScore"></span>/10</p>
29
+ </div>
30
+
31
+ <div class="row g-4 mt-2" id="priorityCards">
32
+ <!-- Cards will be dynamically inserted here -->
33
+ </div>
34
+
35
+ <div class="row mt-2 table-responsive" id="sentimentTable">
36
+ <!-- Sentiment table will be dynamically inserted here -->
37
+ </div>
38
+ </div>
39
+ </div>
40
+ {% endblock %}
41
+
42
+ {% block scripts %}
43
+ <script>
44
+ $(document).ready(function(){
45
+ let progressInterval;
46
+
47
+ $('#classificationForm').on('submit', function(e) {
48
+ e.preventDefault();
49
+
50
+ // Reset and show progress
51
+ $('#progressContainer').show();
52
+ $('#resultsContainer').hide();
53
+ $('#progressBar').css('width', '0%');
54
+ $('#progressText').text('Starting...');
55
+
56
+ // Clear any existing interval
57
+ if (progressInterval) {
58
+ clearInterval(progressInterval);
59
+ }
60
+
61
+ // Get form data
62
+ const formData = new FormData(this);
63
+ const progressKey = Date.now().toString();
64
+ formData.append('progress_key', progressKey);
65
+
66
+ // Start progress checking
67
+ progressInterval = setInterval(() => {
68
+ $.get(`/progress/${progressKey}/`, function(data) {
69
+ $('#progressBar').css('width', `${data.percent}%`);
70
+ $('#progressText').text(`${data.stage}...`);
71
+
72
+ if (data.stage === 'complete') {
73
+ clearInterval(progressInterval);
74
+ }
75
+ });
76
+ }, 1000);
77
+
78
+ // Submit form via AJAX
79
+ $.ajax({
80
+ url: window.location.pathname,
81
+ type: 'POST',
82
+ data: formData,
83
+ processData: false,
84
+ contentType: false,
85
+ success: function(response) {
86
+ clearInterval(progressInterval);
87
+ $('#progressContainer').hide();
88
+ $('#resultsContainer').show();
89
+
90
+ // Update results
91
+ $('#category').text(response.classification);
92
+ $('#keywords').text(response.keywords.join(', '));
93
+ $('#priorityScore').text(response.priority_data.priority_score.toFixed(1));
94
+
95
+ // Update priority cards
96
+ const priorityFlags = response.priority_data.priority_flags;
97
+ const cardData = [
98
+ { title: 'Does it have fact-check news value?', flag: 'fact_check_value', bg: 'bg-primary' },
99
+ { title: 'Could it cause confusion?', flag: 'cause_confusion', bg: 'bg-secondary' },
100
+ { title: 'Could it cause chaos?', flag: 'cause_chaos', bg: 'bg-success' },
101
+ { title: 'Does it affect government?', flag: 'affects_government', bg: 'bg-danger' },
102
+ { title: 'Immediate economic impact?', flag: 'economic_impact', bg: 'bg-warning' },
103
+ { title: 'Have laws been broken/bent?', flag: 'law_related', bg: 'bg-info' },
104
+ { title: 'Is it in the public interest?', flag: 'public_interest', bg: 'bg-light' },
105
+ { title: 'Are lives in danger?', flag: 'lives_in_danger', bg: 'bg-dark' },
106
+ { title: 'Is it already viral?', flag: 'viral', bg: 'bg-warning' },
107
+ { title: 'Is it urgent or time sensitive?', flag: 'urgent', bg: 'bg-success' }
108
+ ];
109
+
110
+ let cardsHtml = '';
111
+ cardData.forEach(card => {
112
+ cardsHtml += `
113
+ <div class="col-12 col-sm-6 col-md-4 col-lg-3">
114
+ <div class="card text-white ${card.bg} h-100 shadow">
115
+ <div class="card-body">
116
+ <h5 class="card-title" style="height: 50px;">${card.title}</h5>
117
+ <p class="card-text fs-1">${priorityFlags[card.flag] ? 'Yes' : 'No'}</p>
118
+ </div>
119
+ </div>
120
+ </div>
121
+ `;
122
+ });
123
+ $('#priorityCards').html(cardsHtml);
124
+
125
+ // Update sentiment table if available
126
+ if (response.sentiment_data && response.sentiment_data.table_html) {
127
+ $('#sentimentTable').html(response.sentiment_data.table_html);
128
+ $('#sentimentTable table').DataTable({
129
+ responsive: true
130
+ });
131
+ }
132
+ },
133
+ error: function(xhr) {
134
+ clearInterval(progressInterval);
135
+ $('#progressContainer').hide();
136
+ alert('Error: ' + (xhr.responseJSON?.error || 'An error occurred'));
137
+ }
138
+ });
139
+ });
140
+ });
141
+ </script>
142
+ {% endblock %}
ai_api/templates/home-copy.html ADDED
@@ -0,0 +1,38 @@
1
+ <!-- templates/home.html -->
2
+ {% extends 'base.html' %}
3
+
4
+ {% block title %}Welcome to My Homepage{% endblock %}
5
+
6
+ {% block content %}
7
+ <h1>BERNAMA Fact Check Test Bed!</h1>
8
+ <div class="row col-12 mb-2">
9
+ <div class="card col-3 m-1">
10
+ <div class="card-body">
11
+ <h5 class="card-title">Claim Classification</h5>
12
+ <p class="card-text">Input a claim and submit for AI to classify the statement.</p>
13
+ <a href="/classification" class="btn btn-primary">Test Now</a>
14
+ </div>
15
+ </div>
16
+ <div class="card col-3 m-1">
17
+ <div class="card-body">
18
+ <h5 class="card-title">Image Profiling</h5>
19
+ <p class="card-text">Upload an image for AI to analyze.</p>
20
+ <a href="/image_profiling" class="btn btn-primary">Test Now</a>
21
+ </div>
22
+ </div>
23
+ <div class="card col-3 m-1">
24
+ <div class="card-body">
25
+ <h5 class="card-title">Register New Face</h5>
26
+ <p class="card-text">Insert a person name for AI to learn face recongnition.</p>
27
+ <a href="/register_face" class="btn btn-primary">Test Now</a>
28
+ </div>
29
+ </div>
30
+ <div class="card col-3 m-1">
31
+ <div class="card-body">
32
+ <h5 class="card-title">Transcription</h5>
33
+ <p class="card-text">Audio/Video to transcription (text)</p>
34
+ <a href="/transcription" class="btn btn-primary">Test Now</a>
35
+ </div>
36
+ </div>
37
+ </div>
38
+ {% endblock %}
ai_api/templates/home.html ADDED
@@ -0,0 +1,60 @@
1
+ {% extends 'base.html' %}
2
+
3
+ {% block title %}BERNAMA Fact Check{% endblock %}
4
+
5
+ {% block content %}
6
+
7
+ <!-- Features Section -->
8
+ <section id="features" class="py-5">
9
+ <div class="container">
10
+ <h2 class="text-center fw-bold mb-5 display-6">Core AI Modules</h2>
11
+ <div class="row g-4">
12
+ <!-- Feature Card -->
13
+ <div class="col-12 col-md-6 col-lg-4">
14
+ <a href="/classification" class="text-decoration-none">
15
+ <div class="card h-100 shadow-sm hover-shadow transition">
16
+ <div class="card-body">
17
+ <h5 class="card-title">Claim Classification</h5>
18
+ <p class="card-text text-muted">Input a claim and submit for AI to classify the statement.</p>
19
+ </div>
20
+ </div>
21
+ </a>
22
+ </div>
23
+
24
+ <div class="col-12 col-md-6 col-lg-4">
25
+ <a href="/transcription" class="text-decoration-none">
26
+ <div class="card h-100 shadow-sm hover-shadow transition">
27
+ <div class="card-body">
28
+ <h5 class="card-title">Transcription</h5>
29
+ <p class="card-text text-muted">Convert spoken words into text using advanced speech-to-text models.</p>
30
+ </div>
31
+ </div>
32
+ </a>
33
+ </div>
34
+
35
+ <div class="col-12 col-md-6 col-lg-4">
36
+ <a href="/image_profiling" class="text-decoration-none">
37
+ <div class="card h-100 shadow-sm hover-shadow transition">
38
+ <div class="card-body">
39
+ <h5 class="card-title">Image Processing</h5>
40
+ <p class="card-text text-muted">Image profiling; face detection, metadata, captioning etc.</p>
41
+ </div>
42
+ </div>
43
+ </a>
44
+ </div>
45
+
46
+ <div class="col-12 col-md-6 col-lg-4">
47
+ <a href="/register_face" class="text-decoration-none">
48
+ <div class="card h-100 shadow-sm hover-shadow transition">
49
+ <div class="card-body">
50
+ <h5 class="card-title">Face Register</h5>
51
+ <p class="card-text text-muted">Register new face.</p>
52
+ </div>
53
+ </div>
54
+ </a>
55
+ </div>
56
+ </div>
57
+ </div>
58
+ </section>
59
+
60
+ {% endblock %}
ai_api/templates/image_profiling.html ADDED
@@ -0,0 +1,122 @@
1
+ {% extends 'base.html' %}
2
+ {% block content %}
3
+ <h2 class="mb-4 fw-bold text-white">Image Processing</h2>
4
+
5
+ <form class="mb-4" method="POST" enctype="multipart/form-data">
6
+ {% csrf_token %}
7
+ {{ form.as_p }}
8
+ <button type="submit" class="btn btn-primary">Upload Image</button>
9
+ </form>
10
+
11
+ {% if proccessed %}
12
+ <div class="mt-4">
13
+ <div class="nav nav-tabs" id="myTab" role="tablist">
14
+ <a class="nav-item nav-link active" id="home-tab" data-bs-toggle="tab" href="#home-tab-pane">Uploaded Image</a>
15
+ <a class="nav-item nav-link" id="profile-tab" data-bs-toggle="tab" href="#profile-tab-pane">Face Detects</a>
16
+ <a class="nav-item nav-link" id="contact-tab" data-bs-toggle="tab" href="#contact-tab-pane">OCR Texts</a>
17
+ <a class="nav-item nav-link" id="disabled-tab" data-bs-toggle="tab" href="#disabled-tab-pane">Metadata</a>
18
+ <a class="nav-item nav-link" id="augmentive-tab" data-bs-toggle="tab" href="#augmentive-tab-pane">Augmentive</a>
19
+ </div>
20
+
21
+ <div class="tab-content mt-4">
22
+ <div id="home-tab-pane" class="tab-pane fade show active">
23
+ <img class="img-fluid mx-auto rounded" src="{{ uploaded_base64 }}" alt="Uploaded Image">
24
+ </div>
25
+
26
+ <div id="profile-tab-pane" class="tab-pane fade">
27
+ {% if cropped_faces %}
28
+ <div class="row g-3">
29
+ <div class="col-md-5">
30
+ <h3 class="mt-4 fw-bold">Detected Faces</h3>
31
+ <img class="img-fluid rounded" src="{{ image_with_labels }}" alt="Detected Faces">
32
+ </div>
33
+
34
+ <div class="col-md-7">
35
+ <h3 class="mt-4 fw-bold">Cropped Faces</h3>
36
+ <div class="d-flex flex-wrap gap-4">
37
+ {% for face, face_name, distance, fdescription in cropped_faces %}
38
+ <div class="text-center text-xs" style="width: 80px;">
39
+ <img src="{{ face }}" alt="Cropped Face" class="img-thumbnail img-fluid mb-1">
40
+ <div style="font-size:10px">
41
+ <strong>{{ face_name }}</strong><br>{{ fdescription }}
42
+ </div>
43
+ </div>
44
+ {% endfor %}
45
+ </div>
46
+ </div>
47
+ </div>
48
+ {% endif %}
49
+ </div>
50
+
51
+ <div id="contact-tab-pane" class="tab-pane fade">
52
+ {% if texts %}
53
+ <div class="d-flex flex-wrap gap-2">
54
+ {% for text in texts %}
55
+ <span class="badge bg-success text-white">{{ text }}</span>
56
+ {% endfor %}
57
+ </div>
58
+ {% endif %}
59
+ </div>
60
+
61
+ <div id="disabled-tab-pane" class="tab-pane fade">
62
+ <div class="d-flex flex-wrap gap-4">
63
+ {% if metadata %}
64
+ <div class="w-100">
65
+ <table class="table table-sm table-striped">
66
+ <thead class="table-light">
67
+ <tr>
68
+ <th>IPTC Field</th>
69
+ <th>Value</th>
70
+ </tr>
71
+ </thead>
72
+ <tbody>
73
+ {% for tag, value in metadata.items %}
74
+ <tr>
75
+ <td>{{ tag }}</td>
76
+ <td>{{ value }}</td>
77
+ </tr>
78
+ {% endfor %}
79
+ </tbody>
80
+ </table>
81
+ </div>
82
+ {% endif %}
83
+
84
+ {% if exifs %}
85
+ <div class="w-100">
86
+ <table class="table table-sm table-striped">
87
+ <thead class="table-light">
88
+ <tr>
89
+ <th>EXIF Field</th>
90
+ <th>Value</th>
91
+ </tr>
92
+ </thead>
93
+ <tbody>
94
+ {% for tag, value in exifs.items %}
95
+ <tr>
96
+ <td>{{ tag }}</td>
97
+ <td>{{ value }}</td>
98
+ </tr>
99
+ {% endfor %}
100
+ </tbody>
101
+ </table>
102
+ </div>
103
+ {% endif %}
104
+ </div>
105
+ </div>
106
+
107
+ <div id="augmentive-tab-pane" class="tab-pane fade">
108
+ {% if description %}
109
+ <h3 class="fw-semibold">{{ description }}</h3>
110
+ {% endif %}
111
+ </div>
112
+
113
+ <div id="reverse-tab-pane" class="tab-pane fade">
114
+ {% if reverse_images %}
115
+ {{ reverse_images }}
116
+ {% endif %}
117
+ </div>
118
+ </div>
119
+ </div>
120
+
121
+ {% endif %}
122
+ {% endblock %}
ai_api/templates/register_face.html ADDED
@@ -0,0 +1,42 @@
1
+ {% extends 'base.html' %}
2
+ {% block content %}
3
+ <h2 class="mb-4 fw-bold text-white">Face Register</h2>
4
+
5
+ <form method="POST" enctype="multipart/form-data" class="mb-4">
6
+ {% csrf_token %}
7
+ <div class="row g-4">
8
+ <div class="col-md-6">
9
+ <label for="{{ form.person.id_for_label }}" class="form-label">
10
+ {{ form.person.label }}
11
+ </label>
12
+ {{ form.person }}
13
+ </div>
14
+
15
+ <div class="col-md-6">
16
+ <label for="{{ form.keywords.id_for_label }}" class="form-label">
17
+ {{ form.keywords.label }}
18
+ </label>
19
+ {{ form.keywords }}
20
+ </div>
21
+ </div>
22
+
23
+ <div class="row g-4">
24
+ <div class="col-md-6">
25
+ <label for="{{ form.images.id_for_label }}" class="form-label">
26
+ {{ form.images.label }}
27
+ </label>
28
+ {{ form.images }}
29
+ </div>
30
+ </div>
31
+
32
+ <button type="submit" class="btn btn-primary mt-2">
33
+ Register
34
+ </button>
35
+ </form>
36
+
37
+ {% if result %}
38
+ <div class="mt-4 bg-light p-4 rounded shadow-sm">
39
+ <p class="mb-0">{{ result }}</p>
40
+ </div>
41
+ {% endif %}
42
+ {% endblock %}
ai_api/templates/transcription.html ADDED
@@ -0,0 +1,159 @@
1
+ {% extends 'base.html' %}
2
+ {% block content %}
3
+ <h2 class="mb-4 fw-bold text-white">Transcription</h2>
4
+
5
+ <form method="post" class="mb-3" id="yt-form" enctype="multipart/form-data">
6
+ {% csrf_token %}
7
+ {{ form.as_p }}
8
+ <input type="hidden" value="{{progress_key}}" name="progress_key">
9
+ <button type="submit" class="btn btn-primary" id="btnSubmit">
10
+ Transcribe
11
+ </button>
12
+ </form>
13
+
14
+ <!-- Progress Bar -->
15
+ <div class="progress mb-4 d-none" id="progress-container">
16
+ <div class="progress-bar progress-bar-striped progress-bar-animated"
17
+ role="progressbar"
18
+ aria-valuenow="0"
19
+ aria-valuemin="0"
20
+ aria-valuemax="100"
21
+ style="width: 0%">
22
+ </div>
23
+ </div>
24
+
25
+ <!-- Transcription Result -->
26
+ <div id="transcription" class="d-none">
27
+ <div class="bg-light p-4 rounded shadow-sm">
28
+ <div class="container"></div>
29
+ </div>
30
+ </div>
31
+ {% endblock %}
32
+
33
+ {% block scripts %}
34
+ <script src="https://rawcdn.githack.com/mozilla/vtt.js/master/dist/vtt.min.js"></script>
35
+
36
+
37
+
38
+ <script>
39
+ function copyToClipboard(selector) {
40
+ const text = $(selector).text(); // Get innerText
41
+ navigator.clipboard.writeText(text)
42
+ .then(() => {
43
+ // console.log('Copied to clipboard:', text);
44
+ })
45
+ .catch(err => {
46
+ console.error('Failed to copy:', err);
47
+ });
48
+ }
49
+
50
+ function ucfirst(str) {
51
+ if (!str) return '';
52
+ return str.charAt(0).toUpperCase() + str.slice(1);
53
+ }
54
+
55
+ $(document).ready(function () {
56
+ $('#yt-form').on('submit', function (e) {
57
+ e.preventDefault();
58
+
59
+ $('#btnSubmit').text('Downloading...');
60
+ $('#btnSubmit').prop('disabled', true);
61
+
62
+ $('#progress-container').removeClass('d-none');
63
+ const $bar = $('.progress-bar');
64
+ const key = '{{ progress_key }}';
65
+ var formData = new FormData(this);
66
+
67
+ $.ajax({
68
+ url: '.',
69
+ type: 'POST',
70
+ data: formData,
71
+ processData: false,
72
+ contentType: false,
73
+ success: function (response) {
74
+ if (response.segments) {
75
+ $('#transcription').removeClass('d-none');
76
+
77
+ $('#progress-container').removeClass('d-none');
78
+ $('#transcription .container').empty(); // Clear previous content
79
+
80
+ // Insert audio HTML
81
+ $('#transcription .container').append(response.audio_file);
82
+
83
+ // Add subtitle box
84
+ const subtitleBox = $('<div id="subtitleBox" style="padding:1em;background:#222;color:white;margin-top:10px;min-height:40px;"></div>')
85
+ .text("Play the audio");
86
+ $('#transcription .container').append(subtitleBox);
87
+
88
+ // Get the audio file URL from the HTML string
89
+ const audioSrcMatch = response.audio_file.match(/src="([^"]+)"/);
90
+ if (!audioSrcMatch) return;
91
+
92
+ const audioUrl = audioSrcMatch[1]; // /media/uploads/file.wav
93
+ const vttUrl = audioUrl.replace('/uploads/', '/vtt/').replace(/\.\w+$/, '.vtt'); // change extension to .vtt
94
+
95
+ // Load and parse the VTT file using vtt.js
96
+ const audio = document.querySelector('#transcription audio');
97
+ let cues = [];
98
+
99
+ fetch(vttUrl)
100
+ .then(res => res.text())
101
+ .then(vttData => {
102
+ const parser = new WebVTT.Parser(window, WebVTT.StringDecoder());
103
+ parser.oncue = function (cue) {
104
+ cues.push(cue);
105
+ };
106
+ parser.parse(vttData);
107
+ parser.flush();
108
+ });
109
+
110
+ audio.addEventListener('timeupdate', () => {
111
+ const currentTime = audio.currentTime;
112
+ const activeCue = cues.find(cue => currentTime >= cue.startTime && currentTime <= cue.endTime);
113
+ document.getElementById('subtitleBox').textContent = activeCue ? activeCue.text : '';
114
+ });
115
+
116
+ $('<div class="accordion">\
117
+ <div class="accordion-item">\
118
+ <h2 class="accordion-header" id="headingOne">\
119
+ <button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">\
120
+ Full Transcription \
121
+ </button>\
122
+ </h2>\
123
+ <div id="collapseOne" class="accordion-collapse collapse show" aria-labelledby="headingOne" data-bs-parent="#accordionExample">\
124
+ <div class="accordion-body">\
125
+ <div class="float-end"> <a href="'+vttUrl+'" download class="btn btn-sm btn-info me-1" title="Download"> <i class="fa fa-download"></i></a><button class="btn btn-sm me-1 btn-info" title="Copy" onClick="copyToClipboard(\'#segments\')"> <i class="fa fa-clipboard"></i></button></div>\
126
+ <div class="mt-3" id="segments"></div>\
127
+ </div>\
128
+ </div>\
129
+ </div>\
130
+ </div>').appendTo('#transcription .container');
131
+
132
+ $.each(response.segments, function(index, segment) {
133
+ var pElement = $('<pre></pre>').text(segment.text);
134
+ $('#segments').append(pElement);
135
+ });
136
+
137
+ }
138
+ }
139
+ });
140
+
141
+
142
+ const interval = setInterval(function () {
143
+ $.getJSON(`/progress/${key}/`, function (data) {
144
+ $bar.css('width', data.percent + '%');
145
+ $bar.attr('aria-valuenow', data.percent);
146
+ // $bar.html(data.percent + '%');
147
+ $('#btnSubmit').text(ucfirst(data.stage) + '...');
148
+
149
+ if (data.stage === 'done') {
150
+ $('#btnSubmit').prop('disabled', false).text('Transcribe');
151
+ clearInterval(interval);
152
+ $('#progress-container').addClass('d-none');
153
+ }
154
+ });
155
+ }, 1000);
156
+ });
157
+ });
158
+ </script>
159
+ {% endblock %}
ai_api/tests.py ADDED
@@ -0,0 +1,3 @@
1
+ from django.test import TestCase
2
+
3
+ # Create your tests here.
ai_api/urls.py ADDED
@@ -0,0 +1,12 @@
1
+ from django.urls import path
2
+ from . import views
3
+
4
+
5
+ urlpatterns = [
6
+ path('', views.home, name='home'),
7
+ path('classification/', views.classification, name='classification'),
8
+ path('image_profiling/', views.image_profiling, name='image_profiling'),
9
+ path('register_face/', views.register_face, name='register_face'),
10
+ path('transcription/', views.transcription, name='transcription'),
11
+ path('progress/<str:key>/', views.check_progress, name='check_progress'),
12
+ ]
ai_api/views.py ADDED
@@ -0,0 +1,799 @@
1
+ from django.shortcuts import render
2
+ from django.http import JsonResponse
3
+ from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm,TranscribeForm, YouTubeURLForm
4
+ import shutil
5
+ from django.conf import settings
6
+ import torch
7
+ import json
8
+ import os
9
+ from PIL import Image as PILImage
10
+ import io
11
+ import tempfile
12
+ from django.core.cache import cache
13
+ import numpy as numpy_lib
14
+ import pickle
15
+ from deepface import DeepFace
16
+ import cv2
17
+ import base64
18
+ from io import BytesIO
19
+ from . import globals
20
+ import tempfile
21
+ import mimetypes
22
+ import subprocess
23
+ import logging
24
+ import uuid
25
+ import yt_dlp
26
+ import time
27
+ import re
28
+ from pydub import AudioSegment
29
+ import pandas as pd
30
+ import csv
31
+
32
+
33
+ # Setup logging for error handling
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # from ai_api.library.devlab_image import DevLabImage
37
+
38
+ # devlab_image = DevLabImage()
39
+
40
+
41
+ model = globals.model
42
+ tokenizer = globals.tokenizer
43
+ devlab_image = globals.devlab_image
44
+
45
+ with open(f"{globals.save_path}/label_map.json", "r") as f:
46
+ label_map = json.load(f)
47
+
48
+ index_to_label = {v: k for k, v in label_map.items()}
49
+
50
+
51
+ # Create your views here.
52
+ def home(request):
53
+ return render(request, 'home.html')
54
+
55
+
56
+ def classification(request):
57
+ from .library import simple_keyword_extraction, apify_scraper, priority_indexer, websearch, lowyat_crawler, sentiment_analyzer
58
+
59
+ if request.method == 'POST':
60
+ progress_key = request.POST.get("progress_key", str(uuid.uuid4()))
61
+ cache.set(progress_key, {'stage': 'starting', 'percent': 0})
62
+
63
+ text = request.POST.get("claim", "")
64
+ if not text:
65
+ return JsonResponse({"error": "No text provided"}, status=400)
66
+
67
+ claim_id = str(uuid.uuid4())[:8]
68
+
69
+ try:
70
+ # Step 1: Classification
71
+ cache.set(progress_key, {'stage': 'classifying', 'percent': 10})
72
+ inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
73
+ with torch.no_grad():
74
+ outputs = model(**inputs)
75
+ prediction = torch.argmax(outputs.logits, dim=-1).item()
76
+ classification_result = index_to_label.get(prediction, "Unknown")
77
+
78
+ # Step 2: Keyword Extraction
79
+ cache.set(progress_key, {'stage': 'extracting_keywords', 'percent': 20})
80
+ keywords = simple_keyword_extraction.extract_keywords(text)
81
+
82
+ # Step 3: Setup paths
83
+ output_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'output')
84
+ report_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'reports')
85
+ raw_data_path = os.path.join(output_path, f'{claim_id}.csv')
86
+
87
+ # Step 4: Run TikTok scraper
88
+ cache.set(progress_key, {'stage': 'scraping_tiktok', 'percent': 30})
89
+ apify_scraper.run(
90
+ keywords,
91
+ output_path=raw_data_path,
92
+ )
93
+
94
+ # Step 5: Run web search
95
+ cache.set(progress_key, {'stage': 'searching_web', 'percent': 50})
96
+ web_search_results = websearch.run(
97
+ keywords,
98
+ output_path=os.path.join(output_path, f"{claim_id}_web.json"),
99
+ full_claim=text
100
+ )
101
+
102
+ # Step 6: Run Lowyat forum crawler
103
+ cache.set(progress_key, {'stage': 'crawling_forum', 'percent': 60})
104
+ lowyat_path = os.path.join(output_path, f"{claim_id}_lowyat.csv")
105
+ lowyat_sections = ["Kopitiam", "SeriousKopitiam"]
106
+ lowyat_results = lowyat_crawler.run(
107
+ keywords,
108
+ sections=lowyat_sections,
109
+ output_path=lowyat_path,
110
+ full_claim=text
111
+ )
112
+
113
+ # Step 7: Combine datasets
114
+ cache.set(progress_key, {'stage': 'combining_data', 'percent': 70})
115
+ if os.path.exists(lowyat_path):
116
+ lowyat_df = pd.read_csv(lowyat_path)
117
+ if os.path.exists(raw_data_path):
118
+ main_df = pd.read_csv(raw_data_path)
119
+ combined_df = pd.concat([main_df, lowyat_df], ignore_index=True)
120
+ combined_df.to_csv(raw_data_path, index=False)
121
+ else:
122
+ lowyat_df.to_csv(raw_data_path, index=False)
123
+
124
+ # Step 8: Run sentiment analysis
125
+ cache.set(progress_key, {'stage': 'analyzing_sentiment', 'percent': 80})
126
+ sentiment_csv = os.path.join(output_path, f"{claim_id}_sentiment.csv")
127
+ sentiment_data = {}
128
+
129
+ if os.path.exists(raw_data_path):
130
+ sentiment_analyzer.run(raw_data_path, sentiment_csv)
131
+
132
+ if os.path.exists(sentiment_csv):
133
+ sentiment_df = pd.read_csv(sentiment_csv)
134
+ sentiment_counts = sentiment_df['sentiment'].value_counts().to_dict()
135
+ sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
136
+ text_counts = {sentiment_map.get(k, k): v for k, v in sentiment_counts.items()}
137
+ sentiment_data = {
138
+ 'counts': text_counts,
139
+ 'table_html': csv_to_html_table(sentiment_csv)
140
+ }
141
+
142
+ # Step 9: Run priority indexing
143
+ cache.set(progress_key, {'stage': 'indexing_priority', 'percent': 90})
144
+ priority_json = os.path.join(report_path, f"{claim_id}_priority.json")
145
+ priority_data = {}
146
+
147
+ if os.path.exists(sentiment_csv):
148
+ priority_indexer.run(
149
+ claim=text,
150
+ claim_id=claim_id,
151
+ keywords=keywords,
152
+ sentiment_csv=sentiment_csv,
153
+ output_path=priority_json
154
+ )
155
+
156
+ if os.path.exists(priority_json):
157
+ with open(priority_json, 'r') as f:
158
+ priority_data = json.load(f)
159
+ verdict = determine_verdict(priority_data)
160
+
161
+ # Step 10: Complete
162
+ cache.set(progress_key, {'stage': 'complete', 'percent': 100})
163
+
164
+ return JsonResponse({
165
+ 'classification': classification_result,
166
+ 'keywords': keywords,
167
+ 'sentiment_data': sentiment_data,
168
+ 'priority_data': priority_data,
169
+ 'verdict': verdict if 'verdict' in locals() else "UNVERIFIED",
170
+ 'progress_key': progress_key
171
+ })
172
+
173
+ except Exception as e:
174
+ logger.error(f"Error in classification: {str(e)}")
175
+ return JsonResponse({
176
+ 'error': str(e),
177
+ 'progress_key': progress_key
178
+ }, status=500)
179
+
180
+ else:
181
+ form = ClassificationForm()
182
+ return render(request, 'classification.html', {
183
+ 'form': form,
184
+ 'result': {}
185
+ })
186
+
187
+ def determine_verdict(priority_data):
188
+ """Determine verdict based on priority data"""
189
+ # Extract priority flags from the data
190
+ if isinstance(priority_data, dict):
191
+ if "priority_flags" in priority_data:
192
+ priority_flags = priority_data["priority_flags"]
193
+ else:
194
+ # Assume the dictionary itself contains the flags
195
+ priority_flags = priority_data
196
+ else:
197
+ return "UNVERIFIED"
198
+
199
+ # Get sentiment counts if available
200
+ sentiment_counts = {}
201
+ if "sentiment_counts" in priority_data:
202
+ sentiment_counts = priority_data["sentiment_counts"]
203
+ # Convert keys to strings if they're not already
204
+ if any(not isinstance(k, str) for k in sentiment_counts.keys()):
205
+ sentiment_counts = {str(k): v for k, v in sentiment_counts.items()}
206
+
207
+ # Get priority score if available
208
+ priority_score = priority_data.get("priority_score", sum(priority_flags.values()))
209
+
210
+ # Get claim and keywords
211
+ claim = priority_data.get("claim", "").lower()
212
+ keywords = priority_data.get("keywords", [])
213
+ keywords_lower = [k.lower() for k in keywords]
214
+
215
+ # Check for specific claim patterns
216
+ is_azan_claim = any(word in claim for word in ["azan", "larang", "masjid", "pembesar suara"])
217
+ is_religious_claim = any(word in claim for word in ["islam", "agama", "masjid", "surau", "sembahyang", "solat", "zakat"])
218
+
219
+ # Check for economic impact
220
+ economic_related = priority_flags.get("economic_impact", 0) == 1
221
+
222
+ # Check for government involvement
223
+ government_related = priority_flags.get("affects_government", 0) == 1
224
+
225
+ # Check for law-related content
226
+ law_related = priority_flags.get("law_related", 0) == 1
227
+
228
+ # Check for confusion potential
229
+ causes_confusion = priority_flags.get("cause_confusion", 0) == 1
230
+
231
+ # Check for negative sentiment dominance
232
+ negative_dominant = False
233
+ if sentiment_counts:
234
+ pos = int(sentiment_counts.get("positive", sentiment_counts.get("1", 0)))
235
+ neg = int(sentiment_counts.get("negative", sentiment_counts.get("2", 0)))
236
+ neu = int(sentiment_counts.get("neutral", sentiment_counts.get("0", 0)))
237
+ negative_dominant = neg > pos and neg > neu
238
+
239
+ # Special case for azan claim (like the example provided)
240
+ if is_azan_claim and is_religious_claim and "larangan" in claim:
241
+ return "FALSE" # Claim about banning azan is false
242
+
243
+ # Determine verdict based on multiple factors
244
+ if priority_score >= 7.0 and negative_dominant and (government_related or law_related):
245
+ return "FALSE"
246
+ elif priority_score >= 5.0 and causes_confusion:
247
+ return "PARTIALLY_TRUE"
248
+ elif priority_score <= 3.0 and not negative_dominant:
249
+ return "TRUE"
250
+ elif economic_related and government_related:
251
+ # Special case for economic policies by government
252
+ if negative_dominant:
253
+ return "FALSE"
254
+ elif causes_confusion:
255
+ return "PARTIALLY_TRUE"
256
+ else:
257
+ return "TRUE"
258
+ else:
259
+ return "UNVERIFIED"
260
+
261
+ def image_profiling(request):
262
+ # import faiss
263
+
264
+ result = None
265
+ image_with_labels = None
266
+ cropped_faces_base64 = []
267
+ texts = None
268
+ proccessed = False
269
+ uploaded_base64 = None
270
+ exifs = None
271
+ metadata = None
272
+ description = None
273
+ reverse_images = None
274
+
275
+ if request.method == 'POST':
276
+ form = ImageUploadForm(request.POST, request.FILES)
277
+ if form.is_valid():
278
+ proccessed = True
279
+ uploaded_image = request.FILES['image']
280
+
281
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
282
+ for chunk in uploaded_image.chunks():
283
+ tmp.write(chunk)
284
+ tmp_path = tmp.name
285
+
286
+ image = PILImage.open(tmp_path)  # open the saved temp copy; the upload stream was already consumed by chunks()
287
+ image_np = numpy_lib.array(image.convert('RGB'))
288
+ exifs = devlab_image.extract_exif(tmp_path)
289
+ metadata = devlab_image.extract_metadata_exiftool(tmp_path)
290
+ description = devlab_image.generate_description_blip(tmp_path)
291
+ # reverse_images = devlab_image.reverse_search(tmp_path)
292
+
293
+ buffered = io.BytesIO()
294
+ image.save(buffered, format="PNG") # or "JPEG", depending on your image format
295
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
296
+ uploded_base64 = f"data:image/png;base64,{img_str}"
297
+
298
+ texts = devlab_image.extract_text_numpy(image_np)
299
+
300
+
301
+ # Detect face embeddings using DeepFace
302
+ face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
303
+
304
+
305
+ if not face_embeddings:
306
+ return "❌ No faces detected in the image."
307
+
308
+ recognized_faces = {}
309
+ cropped_faces = []
310
+
311
+ for face_data in face_embeddings:
312
+ query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
313
+
314
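+ # Search the face vector store for the nearest stored embedding (top match only) for this detected face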
+ results = devlab_image.query_embedding(query_embedding,1)
315
+ if results and len(results) > 0 and len(results[0]) > 0:
316
+ entity = results[0][0].entity
317
+ print(f"Entity: {entity}") # See what fields are present in the entity
318
+
319
+ face_name = entity.get('name') if entity else 'Unknown'
320
+ fdescription = entity.get('short_description') if entity else ''
321
+ if fdescription is None:
322
+ fdescription = ''
323
+
324
+ distance = round(results[0][0].distance, 4)
325
+
326
+ if distance * 100 > 95:  # flag scores above 0.95 as a confident ("closest") match
327
+ face_name = f"{face_name} (CLOSEST)"
328
+ # Store recognized face data
329
+ recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
330
+ "name": face_name,
331
+ "distance": distance,
332
+ "description": fdescription,
333
+ }
334
+
335
+ # Face location for drawing rectangle and adding label
336
+ face_location = face_data["facial_area"]
337
+ x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
338
+
339
+ # Draw rectangle and label on the image
340
+ # cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
341
+ cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
342
+
343
+ # Crop the detected face and prepare it for displaying
344
+ cropped_face = image_np[y:y + h, x:x + w]
345
+ cropped_faces.append([cropped_face, face_name, distance, fdescription])
346
+
347
+ # label = f"{face_name} (Dist: {round(distance, 2)})"
348
+
349
+ else:
350
+ print('No result found')
351
+
352
+
353
+
354
+ # Convert the image with labels to base64 for HTML rendering
355
+ _, buffer = cv2.imencode('.png', image_np)
356
+ image_base64 = base64.b64encode(buffer).decode('utf-8')
357
+
358
+ # Convert cropped faces to base64 for displaying in template
359
+ cropped_faces_base64 = []
360
+ for face, face_name, distance, fdescription in cropped_faces:
361
+ _, buffer = cv2.imencode('.png', face)
362
+ face_base64 = base64.b64encode(buffer).decode('utf-8')
363
+ cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",face_name, distance, fdescription])
364
+
365
+ # Prepare result for template rendering
366
+ result = recognized_faces
367
+ image_with_labels = f"data:image/png;base64,{image_base64}"
368
+
369
+
370
+ else:
371
+ form = ImageUploadForm()
372
+
373
+ return render(request, 'image_profiling.html', {
374
+ 'form': form,
375
+ 'proccessed' : proccessed,
376
+ 'uploaded_base64': uploaded_base64,
377
+ 'image_with_labels': image_with_labels,
378
+ 'cropped_faces': cropped_faces_base64,
379
+ 'texts': texts,
380
+ 'exifs': exifs,
381
+ 'metadata': metadata,
382
+ 'description': description,
383
+ 'reverse_images': reverse_images
384
+ })
385
+
386
+ # def detect_faces2(request):
387
+ # import faiss
388
+ # import numpy as np
389
+ # import pickle
390
+ # from deepface import DeepFace
391
+ # import cv2
392
+ # import base64
393
+ # from io import BytesIO
394
+ # from PIL import Image
395
+ # import os
396
+
397
+ # result = None
398
+ # image_with_labels = None
399
+ # cropped_faces_base64 = []
400
+
401
+ # if request.method == 'POST':
402
+ # form = ImageUploadForm(request.POST, request.FILES)
403
+ # if form.is_valid():
404
+ # uploaded_image = request.FILES['image']
405
+
406
+ # # Open the uploaded image with Pillow and convert to RGB
407
+ # image = Image.open(uploaded_image).convert('RGB')
408
+ # image_np = numpy_lib.array(image)
409
+
410
+ # # Load FAISS index and metadata
411
+ # save_path = os.path.join(os.path.dirname(__file__), "deepface")
412
+ # try:
413
+ # index = faiss.read_index(save_path + "/faiss_hnsw_index.bin")
414
+ # with open(save_path + "/metadata.pkl", "rb") as f:
415
+ # names = pickle.load(f)
416
+ # except Exception as e:
417
+ # return f"Error loading FAISS index or metadata: {str(e)}"
418
+
419
+ # # Set search parameters for better accuracy in FAISS
420
+ # index.hnsw.efSearch = 100 # Larger = better accuracy, but slower
421
+
422
+ # # Detect face embeddings using DeepFace
423
+ # face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
424
+
425
+ # if not face_embeddings:
426
+ # return "❌ No faces detected in the image."
427
+
428
+ # recognized_faces = {}
429
+ # cropped_faces = []
430
+
431
+ # for face_data in face_embeddings:
432
+ # query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
433
+
434
+ # # Search for the closest matches in the FAISS index
435
+ # D, I = index.search(query_embedding, 1) # D = distances, I = indices
436
+
437
+ # # Get the top match for this face
438
+ # face_name = names[I[0][0]]
439
+ # distance = D[0][0]
440
+
441
+ # # Store recognized face data
442
+ # recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
443
+ # "name": face_name,
444
+ # "distance": round(distance, 4)
445
+ # }
446
+
447
+ # # Face location for drawing rectangle and adding label
448
+ # face_location = face_data["facial_area"]
449
+ # x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
450
+
451
+ # # Draw rectangle and label on the image
452
+ # # cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
453
+ # cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
454
+
455
+ # # Crop the detected face and prepare it for displaying
456
+ # cropped_face = image_np[y:y + h, x:x + w]
457
+ # cropped_faces.append([cropped_face, face_name])
458
+
459
+ # label = f"{face_name} (Dist: {round(distance, 4)})"
460
+
461
+
462
+
463
+ # # Convert the image with labels to base64 for HTML rendering
464
+ # _, buffer = cv2.imencode('.png', image_np)
465
+ # image_base64 = base64.b64encode(buffer).decode('utf-8')
466
+
467
+ # # Convert cropped faces to base64 for displaying in template
468
+ # cropped_faces_base64 = []
469
+ # for face,fname in cropped_faces:
470
+ # _, buffer = cv2.imencode('.png', face)
471
+ # face_base64 = base64.b64encode(buffer).decode('utf-8')
472
+ # cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",fname])
473
+
474
+ # # Prepare result for template rendering
475
+ # result = recognized_faces
476
+ # image_with_labels = f"data:image/png;base64,{image_base64}"
477
+
478
+ # else:
479
+ # form = ImageUploadForm()
480
+
481
+ # return render(request, 'face_detection.html', {
482
+ # 'form': form,
483
+ # 'result': result,
484
+ # 'image_with_labels': image_with_labels,
485
+ # 'cropped_faces': cropped_faces_base64 # Pass the list of cropped faces to the template
486
+ # })
487
+
488
+
489
+ def register_face(request):
490
+ from ai_api.library.devlab_image import DevLabImage
491
+ import os
492
+ from django.core.files.storage import FileSystemStorage
493
+ from django.conf import settings
494
+
495
+ result = None
496
+ if request.method == 'POST':
497
+ form = RegisterFaceForm(request.POST)
498
+ person = request.POST.get("person", "").upper()
499
+ keywords = request.POST.get("keywords", "")
500
+ files = request.FILES.getlist('images')
501
+
502
+ devlab_image = DevLabImage()
503
+
504
+
505
+ if files:
506
+ print('Upload manual')
507
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
508
+ upload_dir = os.path.join(project_root, 'people', person)
509
+
510
+ print(f"Saving to: {upload_dir}")
511
+ os.makedirs(upload_dir, exist_ok=True)
512
+
513
+ fs = FileSystemStorage(location=upload_dir)
514
+
515
+ for file in files:
516
+ filename = fs.save(file.name, file)
517
+ file_url = fs.url(filename)
518
+ print(f"Saved: {file_url}")
519
+ devlab_image.extract_face(person, keywords)
520
+ else:
521
+ print('Download from Google')
522
+ devlab_image.register_person(person, keywords)
523
+
524
+
525
+ else:
526
+ form = RegisterFaceForm()
527
+
528
+
529
+ return render(request, 'register_face.html', {
530
+ 'form': form,
531
+ 'result': result,
532
+ })
533
+
534
+ def check_progress(request, key):
535
+ # print(f"getting progress key {key}")
536
+ progress = cache.get(key, {'stage': 'downloading', 'percent': 0})
537
+ # print(progress)
538
+ return JsonResponse(progress)
539
+
540
+ def handle_uploaded_file(file):
541
+ mime_type, _ = mimetypes.guess_type(file.name)
542
+
543
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
544
+ output_audio_file = temp_audio_file.name
545
+
546
+ if mime_type and mime_type.startswith('video'):
547
+ # Save video temporarily
548
+ with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[-1]) as temp_video_file:
549
+ for chunk in file.chunks():
550
+ temp_video_file.write(chunk)
551
+ video_path = temp_video_file.name
552
+
553
+ # Extract audio using ffmpeg
554
+ command = [
555
+ 'ffmpeg',
556
+ '-y',
557
+ '-i', video_path,
558
+ '-vn', # no video
559
+ '-acodec', 'pcm_s16le', # WAV format
560
+ '-ar', '16000', # 16 kHz sample rate
561
+ '-ac', '1', # Mono channel
562
+ output_audio_file
563
+ ]
564
+
565
+ try:
566
+ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
567
+ print("FFmpeg stderr:", result.stderr.decode())
568
+
569
+ except subprocess.CalledProcessError as e:
570
+ logger.error(f"ffmpeg failed with error: {e.stderr.decode()}")
571
+ raise Exception(f"Audio extraction failed: {e.stderr.decode()}")
572
+
573
+ # Clean up temporary video file
574
+ os.remove(video_path)
575
+
576
+ else:
577
+ # If audio, save it directly
578
+ with open(output_audio_file, 'wb') as f:
579
+ for chunk in file.chunks():
580
+ f.write(chunk)
581
+
582
+ return output_audio_file
583
+
584
+ def format_time(seconds):
585
+ # Convert seconds to WebVTT time format (hh:mm:ss.mmm)
586
+ m, s = divmod(seconds, 60)
587
+ h, m = divmod(m, 60)
588
+ ms = int((s - int(s)) * 1000) # Milliseconds
589
+ return f"{int(h):02}:{int(m):02}:{int(s):02}.{ms:03}"
590
+
591
+ def generate_vtt(segments):
592
+ # Generate the VTT content from the Whisper segments
593
+ vtt_content = "WEBVTT\n\n"
594
+
595
+ for segment in segments:
596
+ start_time = segment['start']
597
+ end_time = segment['end']
598
+ text = segment['text']
599
+
600
+ # Convert seconds to WebVTT time format
601
+ start_time_str = format_time(start_time)
602
+ end_time_str = format_time(end_time)
603
+
604
+ vtt_content += f"{start_time_str} --> {end_time_str}\n{text}\n\n"
605
+
606
+ return vtt_content
607
+
608
+ def save_vtt(output_audio_file, vtt):
609
+ base_name = os.path.splitext(os.path.basename(output_audio_file))[0]
610
+ new_filename = base_name + ".vtt"
611
+
612
+ final_path = os.path.join(settings.MEDIA_ROOT, 'vtt', new_filename)
613
+ os.makedirs(os.path.dirname(final_path), exist_ok=True)
614
+
615
+ with open(final_path, "w", encoding="utf-8") as f:
616
+ f.write(vtt)
617
+
618
+ return final_path
619
+
620
+ def transcription(request):
621
+
622
+
623
+ transcription = None
624
+ error = None
625
+ progress_key = str(uuid.uuid4())
626
+
627
+ if request.method == "POST":
628
+
629
+ progress_key = request.POST.get("progress_key", progress_key)
630
+
631
+ model = globals.whisper_model
632
+ form = YouTubeURLForm(request.POST)
633
+
634
+ #if form.is_valid():
635
+ file = request.FILES.get('file')
636
+ if file:
637
+ # with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
638
+ # for chunk in file.chunks():
639
+ # temp_file.write(chunk)
640
+ # output_audio_file = temp_file.name
641
+ output_audio_file = handle_uploaded_file(file)
642
+ if os.path.getsize(output_audio_file) == 0:
643
+ raise RuntimeError("FFmpeg produced an empty audio file.")
644
+
645
+ print(f"transcribing : {output_audio_file}")
646
+ cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
647
+ result = model.transcribe(output_audio_file,verbose=False)
648
+ vtt = generate_vtt(result['segments'])
649
+ vtt_file = save_vtt(output_audio_file, vtt)
650
+
651
+
652
+ else:
653
+ cache.set(progress_key, {'stage': 'downloading', 'percent': 0})
654
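+ # yt-dlp's _percent_str can include ANSI colour codes; strip them before parsing the numeric percentage below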
+ ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
655
+
656
+ def progress_hook(d):
657
+ # print(f"status {d['status']}")
658
+ if d['status'] == 'downloading':
659
+ # print(d)
660
+ percent_str = d.get('_percent_str', '0%').strip()
661
+ clean_str = ansi_escape.sub('', percent_str).strip()
662
+ # print(f"clean percent_str: {repr(clean_str)}") # e.g. '100.0%'
663
+
664
+ try:
665
+ match = re.search(r'(\d+(?:\.\d+)?)', clean_str)
666
+ if match:
667
+ percent = float(match.group(1))
668
+ else:
669
+ print("❌ Regex didn't match!")
670
+ percent = 0
671
+ except Exception as e:
672
+ print(f"❌ Error parsing percent: {e}")
673
+ percent = 0
674
+
675
+ # print(f"✅ current progress for {progress_key} is: {percent}")
676
+ cache.set(progress_key, {'stage': 'downloading', 'percent': percent})
677
+
678
+ url = request.POST.get('url')
679
+ unique_id = str(uuid.uuid4())
680
+ temp_dir = tempfile.gettempdir()
681
+ base_filename = f"temp_{unique_id}"
682
+ download_path = f"{temp_dir}/{base_filename}.%(ext)s"
683
+ # print(f"download_path: {download_path}")
684
+ output_audio_file = f"{temp_dir}/{base_filename}.mp3"
685
+
686
+ ydl_opts = {
687
+ 'format': 'bestaudio/best',
688
+ 'outtmpl': download_path, # No fixed extension!
689
+ 'postprocessors': [{
690
+ 'key': 'FFmpegExtractAudio',
691
+ 'preferredcodec': 'mp3',
692
+ 'preferredquality': '192',
693
+ }],
694
+ 'progress_hooks': [progress_hook],
695
+ 'quiet': True,
696
+ 'no_warnings': True,
697
+ 'noplaylist': True,
698
+ }
699
+ print(f"downloading : {url}")
700
+ try:
701
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
702
+ ydl.download([url])
703
+ print(f"transcribing : {output_audio_file}")
704
+ cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
705
+ result = model.transcribe(output_audio_file,verbose=False)
706
+ vtt = generate_vtt(result['segments'])
707
+ vtt_file = save_vtt(output_audio_file,vtt)
708
+ except Exception as e:
709
+ return JsonResponse({'error': str(e)}, status=500)  # bail out here; the code below assumes the download and transcription succeeded
710
+
711
+
712
+ # transcription = result['text']
713
+
714
+ # audio = AudioSegment.from_file(output_audio_file)
715
+ # chunk_length_ms = 60 * 1000 # 1-minute chunks
716
+ # chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
717
+ # results = []
718
+ # total_chunks = len(chunks)
719
+ # cache.set(progress_key, {'stage': 'transcribing', 'percent': 0})
720
+
721
+ # for i, chunk in enumerate(chunks):
722
+ # temp_filename = f"temp_chunk_{i}.wav"
723
+ # chunk.export(temp_filename, format="wav")
724
+
725
+ # result = model.transcribe(temp_filename, verbose=False)
726
+ # results.append(result["text"])
727
+
728
+ # os.remove(temp_filename)
729
+
730
+ # # Update progress
731
+ # percent = int((i + 1) / total_chunks * 100)
732
+ # cache.set(progress_key, {'stage': 'transcribing', 'percent': percent})
733
+
734
+ # # Combine all chunk texts
735
+ # transcription = "\n".join(results)
736
+
737
+
738
+ cache.set(progress_key, {'stage': 'done', 'percent': 100})
739
+
740
+ filename = os.path.basename(output_audio_file)
741
+ final_path = os.path.join(settings.MEDIA_ROOT, 'uploads', filename)
742
+ os.makedirs(os.path.dirname(final_path), exist_ok=True)
743
+ shutil.move(output_audio_file, final_path)
744
+
745
+ # Public URL
746
+
747
+ audio_mime = mimetypes.guess_type(filename)[0] or 'audio/mpeg'  # the file may be .wav (upload) or .mp3 (YouTube download)
748
+ file_url = settings.MEDIA_URL + 'uploads/' + filename
749
+ audio_html = f'<audio controls><source src="{file_url}" type="{audio_mime}">Your browser does not support the audio element.</audio>'
750
+
751
+
752
+ return JsonResponse({'text': result['text'], 'segments': result['segments'], 'audio_file': audio_html })
753
+ # if os.path.exists(output_audio_file):
754
+ # os.remove(output_audio_file)
755
+
756
+
757
+ # return render(request, 'transcription.html', {
758
+ # 'form': form,
759
+ # 'transcription': transcription,
760
+ # 'error': error,
761
+ # 'progress_key': progress_key,
762
+ # })
763
+
764
+ else:
765
+ form = TranscribeForm()
766
+
767
+ return render(request, 'transcription.html', {
768
+ 'form': form,
769
+ 'transcription': transcription,
770
+ 'error': error,
771
+ 'progress_key': progress_key,
772
+ })
773
+
774
+ def csv_to_html_table(filepath):
775
+ def is_valid_url(url):
776
+ # URL pattern matching - must start with http:// or https://
777
+ url_pattern = re.compile(
778
+ r'^https?://' # must start with http:// or https://
779
+ r'([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+' # domain
780
+ r'[a-zA-Z]{2,}' # TLD
781
+ r'(/[a-zA-Z0-9-._~:/?#[\]@!$&\'()*+,;=]*)?$' # path and query
782
+ )
783
+ return bool(url_pattern.match(url))
784
+
785
+ html = '<table id="dataset" class="table table-bordered mt-2 smaller">'
786
+ with open(filepath, newline='') as csvfile:
787
+ reader = csv.reader(csvfile)
788
+ for i, row in enumerate(reader):
789
+ if i == 0:
790
+ html += '<thead>'
791
+ html += "<tr>" + "".join(f"<th>{col}</th>" for col in row) + "</tr>"
792
+ html += '</thead>'
793
+ else:
794
+ html += "<tr>" + "".join(
795
+ f'<td><a href="{col}" target="_blank" rel="noopener noreferrer">{col}</a></td>' if is_valid_url(col) else f"<td>{col}</td>"
796
+ for col in row
797
+ ) + "</tr>"
798
+ html += "</table>"
799
+ return html
ai_api/widgets.py ADDED
@@ -0,0 +1,5 @@
1
+ from django.forms.widgets import ClearableFileInput
2
+
3
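+ # File input widget that allows selecting several files at once (Django's allow_multiple_selected flag), e.g. for uploading multiple face images in one submission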
+ class MultipleFileInput(ClearableFileInput):
4
+ allow_multiple_selected = True
5
+
csv_people.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+ import csv
3
+
4
+ # Path to the folder you want to scan
5
+ folder_path = 'people'
6
+
7
+ # Get all subfolder names
8
+ subfolders = [f.name for f in os.scandir(folder_path) if f.is_dir()]
9
+
10
+ # Path to the output CSV file
11
+ csv_file = 'subfolders.csv'
12
+
13
+ # Write the subfolder names to the CSV file
14
+ with open(csv_file, mode='w', newline='') as file:
15
+ writer = csv.writer(file)
16
+ writer.writerow(['Subfolder Name']) # Write the header
17
+ for subfolder in subfolders:
18
+ writer.writerow([subfolder]) # Write each subfolder name
19
+
20
+ print(f"Subfolder names have been written to {csv_file}")
delete_milvus.py ADDED
@@ -0,0 +1,29 @@
1
+ from pymilvus import Collection, connections
2
+ from dotenv import load_dotenv
3
+ import os
4
+ load_dotenv()
5
+
6
+
7
+ milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
8
+ milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
9
+
10
+ connections.connect("default", host=milvus_host, port=int(milvus_port))
11
+
12
+
13
+ # Now, connect to the collection
14
+ collection = Collection("faces")
15
+
16
+ # Query the collection for entities whose 'name' matches the person to be removed
17
+ query = 'name == "YAB DATO SERI ANWAR IBRAHIM"' # Looking for entities where 'name' is empty
18
+
19
+ # Run the query to collect the IDs of the matching entities
20
+ results = collection.query(query, output_fields=["id", "name"])
21
+
22
+ # Delete the matching entities, if any were found
23
+ if results:
24
+ ids_to_delete = [str(result["id"]) for result in results]
25
+ id_expr = f"id in [{', '.join(ids_to_delete)}]"
26
+ collection.delete(expr=id_expr)
27
+ print(f"✅ Deleted entities: {ids_to_delete}")
28
+ else:
29
+ print("❌ No entities found for deletion.")
devlab_next/.gitignore ADDED
@@ -0,0 +1,68 @@
1
+ # Python bytecode files
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ __pycache__/
6
+
7
+ # Virtual environment
8
+ venv/
9
+ env/
10
+
11
+ # Distribution / packaging
12
+ *.egg
13
+ *.egg-info
14
+ dist/
15
+ build/
16
+ *.whl
17
+
18
+ # IDE files
19
+ .idea/
20
+ .vscode/
21
+
22
+ # Jupyter Notebook files
23
+ .ipynb_checkpoints
24
+
25
+ # PyInstaller
26
+ *.manifest
27
+ *.spec
28
+
29
+ # Test and coverage reports
30
+ .coverage
31
+ *.coveragerc
32
+ nosetests.xml
33
+ coverage.xml
34
+ *.coveralls.yml
35
+
36
+ # MyPy
37
+ .mypy_cache/
38
+ .dmypy.json
39
+ dmypy.json
40
+
41
+ # Pytest
42
+ .cache/
43
+
44
+ # Sphinx documentation
45
+ docs/_build/
46
+
47
+ # pytest and flake8
48
+ *.log
49
+
50
+ # VS Code settings
51
+ .vscode/
52
+
53
+ # Django secrets
54
+ *.env
55
+
56
+ # Flask instance folder
57
+ instance/
58
+
59
+ # PyCharm project files
60
+ .idea/
61
+
62
+ # Other Python-related files
63
+ *.bak
64
+ *.swp
65
+ *.swo
66
+ ddet_classification/
67
+ .DS_Store
68
+ .pkl
devlab_next/__init__.py ADDED
File without changes
devlab_next/asgi.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ ASGI config for devlab_next project.
3
+
4
+ It exposes the ASGI callable as a module-level variable named ``application``.
5
+
6
+ For more information on this file, see
7
+ https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
8
+ """
9
+
10
+ import os
11
+
12
+ from django.core.asgi import get_asgi_application
13
+
14
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
15
+
16
+ application = get_asgi_application()
devlab_next/settings.py ADDED
@@ -0,0 +1,166 @@
1
+ """
2
+ Django settings for devlab_next project.
3
+
4
+ Generated by 'django-admin startproject' using Django 4.2.7.
5
+
6
+ For more information on this file, see
7
+ https://docs.djangoproject.com/en/4.2/topics/settings/
8
+
9
+ For the full list of settings and their values, see
10
+ https://docs.djangoproject.com/en/4.2/ref/settings/
11
+ """
12
+
13
+ from pathlib import Path
14
+ import os
15
+
16
+ # Build paths inside the project like this: BASE_DIR / 'subdir'.
17
+ BASE_DIR = Path(__file__).resolve().parent.parent
18
+
19
+
20
+ # Quick-start development settings - unsuitable for production
21
+ # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
22
+
23
+ # SECURITY WARNING: keep the secret key used in production secret!
24
+ SECRET_KEY = 'django-insecure-5a87e9*^s30hb+%+h@t^06493w2tpv7w6%+(0!#iu77b%*8=#i'
25
+
26
+ # SECURITY WARNING: don't run with debug turned on in production!
27
+ DEBUG = True
28
+
29
+ ALLOWED_HOSTS = ['127.0.0.1','fctestbed.bernama.com','localhost']
30
+
31
+
32
+ # Application definition
33
+
34
+ INSTALLED_APPS = [
35
+ 'django.contrib.admin',
36
+ 'django.contrib.auth',
37
+ 'django.contrib.contenttypes',
38
+ 'django.contrib.sessions',
39
+ 'django.contrib.messages',
40
+ 'django.contrib.staticfiles',
41
+ 'rest_framework',
42
+ # 'ai_api',
43
+ 'ai_api.apps.AiApiConfig',
44
+ ]
45
+
46
+ MIDDLEWARE = [
47
+ 'django.middleware.security.SecurityMiddleware',
48
+ 'django.contrib.sessions.middleware.SessionMiddleware',
49
+ 'django.middleware.common.CommonMiddleware',
50
+ 'django.middleware.csrf.CsrfViewMiddleware',
51
+ 'django.contrib.auth.middleware.AuthenticationMiddleware',
52
+ 'django.contrib.messages.middleware.MessageMiddleware',
53
+ 'django.middleware.clickjacking.XFrameOptionsMiddleware',
54
+ # 'ai_api.middleware.HMACAuthMiddleware'
55
+ ]
56
+
57
+ ROOT_URLCONF = 'devlab_next.urls'
58
+
59
+ TEMPLATES = [
60
+ {
61
+ 'BACKEND': 'django.template.backends.django.DjangoTemplates',
62
+ 'DIRS': [],
63
+ 'APP_DIRS': True,
64
+ 'OPTIONS': {
65
+ 'context_processors': [
66
+ 'django.template.context_processors.debug',
67
+ 'django.template.context_processors.request',
68
+ 'django.contrib.auth.context_processors.auth',
69
+ 'django.contrib.messages.context_processors.messages',
70
+ ],
71
+ },
72
+ },
73
+ ]
74
+
75
+ WSGI_APPLICATION = 'devlab_next.wsgi.application'
76
+
77
+
78
+ # Database
79
+ # https://docs.djangoproject.com/en/4.2/ref/settings/#databases
80
+
81
+ # DATABASES = {
82
+ # 'default': {
83
+ # 'ENGINE': 'django.db.backends.sqlite3',
84
+ # 'NAME': BASE_DIR / 'db.sqlite3',
85
+ # }
86
+ # }
87
+
88
+ DATABASES = {
89
+ "default": {
90
+ "ENGINE": "django.db.backends.postgresql",
91
+ "NAME": os.environ.get("DB_NAME", "factcheckapidb"),
92
+ "USER": os.environ.get("DB_USER", "postgres"),
93
+ "PASSWORD": os.environ.get("DB_PASSWORD", "postgres"),
94
+ "HOST": os.environ.get("DB_HOST", "127.0.0.1"),
95
+ "PORT": os.environ.get("DB_PORT", "5432"),
96
+ }
97
+ }
98
+
99
+
100
+
101
+ # Password validation
102
+ # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
103
+
104
+ AUTH_PASSWORD_VALIDATORS = [
105
+ {
106
+ 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
107
+ },
108
+ {
109
+ 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
110
+ },
111
+ {
112
+ 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
113
+ },
114
+ {
115
+ 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
116
+ },
117
+ ]
118
+
119
+
120
+ # Internationalization
121
+ # https://docs.djangoproject.com/en/4.2/topics/i18n/
122
+
123
+ LANGUAGE_CODE = 'en-us'
124
+
125
+ TIME_ZONE = 'UTC'
126
+
127
+ USE_I18N = True
128
+
129
+ USE_TZ = True
130
+
131
+
132
+ # Static files (CSS, JavaScript, Images)
133
+ # https://docs.djangoproject.com/en/4.2/howto/static-files/
134
+
135
+ STATIC_URL = '/static/'
136
+ # STATIC_ROOT = BASE_DIR / 'static/'
137
+
138
+ STATICFILES_DIRS = [
139
+ os.path.join(BASE_DIR, 'static'),
140
+ ]
141
+
142
+
143
+
144
+ # Default primary key field type
145
+ # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
146
+
147
+ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
148
+
149
+ MEDIA_URL = '/media/'
150
+ MEDIA_ROOT = BASE_DIR / 'media'
151
+
152
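+ # NOTE: LocMemCache is per-process, so progress values written by one gunicorn worker are not visible to another; a shared backend (e.g. Redis or the database cache) would be needed for reliable cross-worker progress polling.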
+ CACHES = {
153
+ 'default': {
154
+ 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', # In-memory
155
+ 'LOCATION': 'progress-cache',
156
+ }
157
+ }
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
devlab_next/urls.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ URL configuration for devlab_next project.
3
+
4
+ The `urlpatterns` list routes URLs to views. For more information please see:
5
+ https://docs.djangoproject.com/en/4.2/topics/http/urls/
6
+ Examples:
7
+ Function views
8
+ 1. Add an import: from my_app import views
9
+ 2. Add a URL to urlpatterns: path('', views.home, name='home')
10
+ Class-based views
11
+ 1. Add an import: from other_app.views import Home
12
+ 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
13
+ Including another URLconf
14
+ 1. Import the include() function: from django.urls import include, path
15
+ 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
16
+ """
17
+ from django.contrib import admin
18
+ from django.urls import path, include
19
+ from django.conf import settings
20
+ from django.conf.urls.static import static
21
+ import os
22
+
23
+ admin.site.site_header = "BERNAMA Fact Check"
24
+ admin.site.site_title = "BERNAMA Fact Check Portal"
25
+ admin.site.index_title = "Dashboard"
26
+
27
+ urlpatterns = [
28
+ path('admin/', admin.site.urls),
29
+ path('', include('ai_api.urls')),
30
+ path('api/v1/', include('ai_api.api_urls')),
31
+ ]+ static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
32
+
33
+ urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
devlab_next/wsgi.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ WSGI config for devlab_next project.
3
+
4
+ It exposes the WSGI callable as a module-level variable named ``application``.
5
+
6
+ For more information on this file, see
7
+ https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
8
+ """
9
+
10
+ import os
11
+
12
+ from django.core.wsgi import get_wsgi_application
13
+
14
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
15
+
16
+ application = get_wsgi_application()
docker-compose.yml ADDED
@@ -0,0 +1,95 @@
1
+ version: '3.5'
2
+
3
+ services:
4
+ web:
5
+ build: .
6
+ container_name: django_app
7
+ mem_limit: 16g
8
+ command: gunicorn devlab_next.wsgi:application --bind 0.0.0.0:8000 --workers 3 --log-level debug
9
+ volumes:
10
+ - .:/app
11
+ ports:
12
+ - "8000:8000"
13
+ depends_on:
14
+ - milvus-standalone
15
+ environment:
16
+ - DJANGO_SETTINGS_MODULE=devlab_next.settings
17
+ - TF_CPP_MIN_LOG_LEVEL=2
18
+ networks:
19
+ - milvus_network
20
+
21
+ milvus-standalone:
22
+ container_name: milvus
23
+ image: milvusdb/milvus:v2.5.8
24
+ command: ["milvus", "run", "standalone"]
25
+ security_opt:
26
+ - seccomp:unconfined
27
+ restart: always
28
+ ports:
29
+ - "19530:19530" # gRPC
30
+ - "19121:19121" # HTTP (correct health port)
31
+ volumes:
32
+ - ./volumes/milvus:/var/lib/milvus
33
+ healthcheck:
34
+ test: ["CMD", "curl", "-f", "http://localhost:19121/healthz"]
35
+ interval: 30s
36
+ start_period: 90s
37
+ timeout: 20s
38
+ retries: 3
39
+ depends_on:
40
+ - etcd
41
+ - minio
42
+ environment:
43
+ ETCD_ENDPOINTS: etcd:2379
44
+ MINIO_ADDRESS: minio:9000
45
+ MINIO_ACCESS_KEY: minioadmin
46
+ MINIO_SECRET_KEY: minioadmin
47
+ MILVUS_LOG_LEVEL: debug
48
+ networks:
49
+ - milvus_network
50
+
51
+ etcd:
52
+ image: quay.io/coreos/etcd:v3.5.18
53
+ container_name: etcd
54
+ command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
55
+ environment:
56
+ - ETCD_AUTO_COMPACTION_MODE=revision
57
+ - ETCD_AUTO_COMPACTION_RETENTION=1000
58
+ - ETCD_QUOTA_BACKEND_BYTES=4294967296
59
+ - ETCD_SNAPSHOT_COUNT=50000
60
+ volumes:
61
+ - ./volumes/etcd:/etcd
62
+ healthcheck:
63
+ test: ["CMD", "etcdctl", "endpoint", "health"]
64
+ interval: 30s
65
+ timeout: 20s
66
+ retries: 3
67
+ ports:
68
+ - "2379:2379"
69
+ - "2380:2380"
70
+ networks:
71
+ - milvus_network
72
+
73
+ minio:
74
+ container_name: minio
75
+ image: minio/minio:RELEASE.2023-03-20T20-16-18Z
76
+ environment:
77
+ MINIO_ACCESS_KEY: minioadmin
78
+ MINIO_SECRET_KEY: minioadmin
79
+ command: minio server /minio_data --console-address ":9001"
80
+ ports:
81
+ - "9000:9000"
82
+ - "9001:9001"
83
+ volumes:
84
+ - ./volumes/minio:/minio_data
85
+ healthcheck:
86
+ test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
87
+ interval: 30s
88
+ timeout: 20s
89
+ retries: 3
90
+ networks:
91
+ - milvus_network
92
+
93
+ networks:
94
+ milvus_network:
95
+ driver: bridge
download_people.py ADDED
@@ -0,0 +1,14 @@
1
+ from ai_api.library.devlab_image import DevLabImage
2
+ import csv
3
+
4
+ devlab_image = DevLabImage()
5
+
6
+ # # Open and read the CSV file
7
+ with open("subfolders.csv", mode="r", encoding="utf-8") as file:
8
+ reader = csv.reader(file)
9
+ for row in reader:
10
+ if not row or row[0] == "Subfolder Name": continue # skip the header row written by csv_people.py
11
+ devlab_image.register_person(row[0], row[1] if len(row) > 1 else "") # each row: name, optional keywords
12
+
13
+ # field_value = input("Enter the name: ")
14
+ # devlab_image.download_person_images(field_value.upper())
list_faces.py ADDED
@@ -0,0 +1,23 @@
1
+ from pymilvus import Collection, connections
2
+ from dotenv import load_dotenv
3
+ import os
4
+ load_dotenv()
5
+
6
+
7
+ milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
8
+ milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
9
+
10
+ connections.connect("default", host=milvus_host, port=int(milvus_port))
11
+
12
+ # Now, connect to the collection
13
+ collection = Collection("faces")
14
+
15
+ # Query expression that retrieves all documents with a non-null 'id' (or use any valid field)
16
+ query = "id IS NOT NULL" # Valid query expression to fetch all documents
17
+
18
+ # Retrieve all documents, adjust fields based on your collection schema
19
+ results = collection.query(query, output_fields=["id", "name"])
20
+
21
+ # Print all results
22
+ for result in results:
23
+ print(f"ID: {result['id']}, Name: {result.get('name', 'N/A')}")