Commit 090987a
Parent(s): initial

This view is limited to 50 files because it contains too many changes.
- .dockerignore +9 -0
- .gitignore +82 -0
- Dockerfile +32 -0
- ai_api/.gitignore +68 -0
- ai_api/__init__.py +0 -0
- ai_api/admin.py +10 -0
- ai_api/api.py +44 -0
- ai_api/api_urls.py +10 -0
- ai_api/apps.py +63 -0
- ai_api/controllers/__init__.py +2 -0
- ai_api/controllers/classification.py +15 -0
- ai_api/controllers/transcription.py +16 -0
- ai_api/forms.py +86 -0
- ai_api/globals.py +6 -0
- ai_api/library/apify_scraper.py +893 -0
- ai_api/library/config.py +131 -0
- ai_api/library/devlab_image.py +487 -0
- ai_api/library/lowyat_crawler.py +714 -0
- ai_api/library/priority_indexer.py +360 -0
- ai_api/library/sentiment_analyzer.py +91 -0
- ai_api/library/simple_keyword_extraction.py +205 -0
- ai_api/library/websearch.py +237 -0
- ai_api/middleware.py +40 -0
- ai_api/migrations/0001_initial.py +24 -0
- ai_api/migrations/__init__.py +0 -0
- ai_api/models.py +18 -0
- ai_api/request_serializer.py +30 -0
- ai_api/templates/base-copy.html +35 -0
- ai_api/templates/base.html +61 -0
- ai_api/templates/classification.html +142 -0
- ai_api/templates/home-copy.html +38 -0
- ai_api/templates/home.html +60 -0
- ai_api/templates/image_profiling.html +122 -0
- ai_api/templates/register_face.html +42 -0
- ai_api/templates/transcription.html +159 -0
- ai_api/tests.py +3 -0
- ai_api/urls.py +12 -0
- ai_api/views.py +799 -0
- ai_api/widgets.py +5 -0
- csv_people.py +20 -0
- delete_milvus.py +29 -0
- devlab_next/.gitignore +68 -0
- devlab_next/__init__.py +0 -0
- devlab_next/asgi.py +16 -0
- devlab_next/settings.py +166 -0
- devlab_next/urls.py +33 -0
- devlab_next/wsgi.py +16 -0
- docker-compose.yml +95 -0
- download_people.py +14 -0
- list_faces.py +23 -0
.dockerignore
ADDED
@@ -0,0 +1,9 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.db
+venv/
+.git/
+nohup.out
+core
.gitignore
ADDED
@@ -0,0 +1,82 @@
+# Python bytecode files
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Virtual environment
+venv/
+env/
+
+# Distribution / packaging
+*.egg
+*.egg-info
+dist/
+build/
+*.whl
+
+# IDE files
+.idea/
+.vscode/
+
+# Jupyter Notebook files
+.ipynb_checkpoints
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Test and coverage reports
+.coverage
+*.coveragerc
+nosetests.xml
+coverage.xml
+*.coveralls.yml
+
+# MyPy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pytest
+.cache/
+
+# Sphinx documentation
+docs/_build/
+
+# pytest and flake8
+*.log
+
+# VS Code settings
+.vscode/
+
+# Django secrets
+*.env
+
+# Flask instance folder
+instance/
+
+# PyCharm project files
+.idea/
+
+# Other Python-related files
+*.bak
+*.swp
+*.swo
+ddet_classification/
+.DS_Store
+.pkl
+people/
+people_backup/
+*.mp3
+*.wav
+media/uploads/
+media/vtt/
+volumes/
+output/
+reports/
+data/
+ai_api/library/data/
+ai_api/library/output/
+ai_api/library/cache/
+ai_api/library/reports/
Dockerfile
ADDED
@@ -0,0 +1,32 @@
+FROM python:3.9-slim
+
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TF_CPP_MIN_LOG_LEVEL=2
+
+# Install dependencies
+#RUN apt-get update && apt-get install -y exiftool ffmpeg curl libglib2.0-0 libsm6 libxext6 libxrender-dev
+# Install Chrome & dependencies
+RUN apt-get update && apt-get install -y \
+    wget unzip curl gnupg exiftool ffmpeg \
+    fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libatk1.0-0 libcups2 libdbus-1-3 libgdk-pixbuf2.0-0 \
+    libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1 \
+    chromium chromium-driver \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set work directory
+WORKDIR /app
+
+# Copy project files
+COPY . /app
+
+# Install Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Expose port
+EXPOSE 8000
+
+# Run app using Gunicorn
+#CMD ["gunicorn", "--bind", "0.0.0.0:8000", "devlab_next.wsgi:application"]
+CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]
+
ai_api/.gitignore
ADDED
@@ -0,0 +1,68 @@
+# Python bytecode files
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Virtual environment
+venv/
+env/
+
+# Distribution / packaging
+*.egg
+*.egg-info
+dist/
+build/
+*.whl
+
+# IDE files
+.idea/
+.vscode/
+
+# Jupyter Notebook files
+.ipynb_checkpoints
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Test and coverage reports
+.coverage
+*.coveragerc
+nosetests.xml
+coverage.xml
+*.coveralls.yml
+
+# MyPy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pytest
+.cache/
+
+# Sphinx documentation
+docs/_build/
+
+# pytest and flake8
+*.log
+
+# VS Code settings
+.vscode/
+
+# Django secrets
+*.env
+
+# Flask instance folder
+instance/
+
+# PyCharm project files
+.idea/
+
+# Other Python-related files
+*.bak
+*.swp
+*.swo
+ddet_classification/
+.DS_Store
+.pkl
ai_api/__init__.py
ADDED
File without changes
ai_api/admin.py
ADDED
@@ -0,0 +1,10 @@
+from django.contrib import admin
+from .models import APIClient
+
+# admin.site.register(APIClient)
+
+@admin.register(APIClient)
+class APIClientAdmin(admin.ModelAdmin):
+    list_display = ('name', 'client_id', 'created_at')
+    readonly_fields = ('client_id', 'secret_key', 'created_at')
+    fields = ('name', 'client_id', 'secret_key', 'created_at')  # show in form
ai_api/api.py
ADDED
@@ -0,0 +1,44 @@
+from django.shortcuts import render
+from django.http import JsonResponse
+from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm, TranscribeForm, YouTubeURLForm
+import shutil
+from django.conf import settings
+import torch
+import json
+import os
+from PIL import Image as PILImage
+import io
+import tempfile
+from django.core.cache import cache
+import numpy as numpy_lib
+import pickle
+from deepface import DeepFace
+import cv2
+import base64
+from io import BytesIO
+from . import globals
+import tempfile
+import mimetypes
+import subprocess
+import logging
+import uuid
+import yt_dlp
+import time
+import re
+from pydub import AudioSegment
+import pandas as pd
+import csv
+from .models import APIClient
+
+API_VERSION = '1.0.0'
+
+def index(request):
+    return JsonResponse({'message': 'Welcome to the BERNAMA Fact Check API', 'version': API_VERSION})
+
+def clients(request):
+    # if not hasattr(request, 'api_client'):
+    #     return JsonResponse({'error': 'Unauthorized'}, status=401)
+
+    clients = list(APIClient.objects.values('name', 'client_id', 'created_at'))
+    return JsonResponse({'clients': clients})
+
ai_api/api_urls.py
ADDED
@@ -0,0 +1,10 @@
+from django.urls import path
+from . import api, controllers
+
+urlpatterns = [
+    path('', api.index, name='index'),
+    path('ping/', api.index, name='index'),
+    path('clients/', api.clients, name='clients'),
+    path('transcription/', controllers.transcription.TranscriptionAPIView.as_view(), name='transcription'),
+    path('classification/', controllers.classification.ClassificationAPIView.as_view(), name='classification'),
+]
ai_api/apps.py
ADDED
@@ -0,0 +1,63 @@
+from django.apps import AppConfig
+
+class AiApiConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'ai_api'
+
+    def ready(self):
+        from . import globals
+        from deepface import DeepFace
+        from ai_api.library.devlab_image import DevLabImage
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
+        import whisper
+        import os
+        from safetensors import safe_open
+        import torch
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        globals.devlab_image = DevLabImage()
+
+        # Load HuggingFace tokenizer and model once
+        save_path = os.path.join(os.path.dirname(__file__), "ddet_classification")
+        print(f"Model path: {save_path}")
+        globals.save_path = save_path
+
+        # Load tokenizer
+        try:
+            globals.tokenizer = AutoTokenizer.from_pretrained(save_path, device=device)
+            print("Tokenizer loaded ✅")
+        except Exception as e:
+            print(f"Failed to load tokenizer: {e}")
+            globals.tokenizer = None
+
+        # Check .safetensors before loading model
+        try:
+            safetensor_file = os.path.join(save_path, "model.safetensors")
+            if os.path.exists(safetensor_file):
+                with safe_open(safetensor_file, framework="pt") as f:
+                    print("Safetensors file checked ✅")
+
+            globals.model = AutoModelForSequenceClassification.from_pretrained(save_path)
+            globals.model.eval()
+            print("Classification model loaded ✅")
+
+        except Exception as e:
+            print(f"Failed to load classification model: {e}")
+            globals.model = None
+
+        # Load Whisper model
+        try:
+            globals.whisper_model = whisper.load_model("large", device=device)
+            print("Whisper model loaded ✅")
+        except Exception as e:
+            print(f"Failed to load Whisper model: {e}")
+            globals.whisper_model = None
+
+        # Load FaceNet model
+        try:
+            globals.facenet_model = DeepFace.build_model("Facenet")
+            print("Facenet model loaded ✅")
+        except Exception as e:
+            print(f"Failed to load FaceNet model: {e}")
+            globals.facenet_model = None
ai_api/controllers/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from . import transcription
+from . import classification
ai_api/controllers/classification.py
ADDED
@@ -0,0 +1,15 @@
+# classification.py
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from ..request_serializer import ClassificationRequestSerializer
+
+class ClassificationAPIView(APIView):
+    def get(self, request):
+        return Response({"message": "Classification API"})
+
+    def post(self, request):
+        serializer = ClassificationRequestSerializer(data=request.data)
+        if serializer.is_valid():
+            return Response({"message": "Classification API"})
+        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
ai_api/controllers/transcription.py
ADDED
@@ -0,0 +1,16 @@
+# transcription.py
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from ..request_serializer import TranscriptionRequestSerializer
+
+class TranscriptionAPIView(APIView):
+    def get(self, request):
+        return Response({"message": "Transcription API"})
+
+    def post(self, request):
+        serializer = TranscriptionRequestSerializer(data=request.data)
+        if serializer.is_valid():
+            media_file = request.FILES.get('media')
+            return Response({"media_file": media_file.name})
+        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
|
ai_api/forms.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django import forms
|
| 2 |
+
from .widgets import MultipleFileInput
|
| 3 |
+
from django.core.exceptions import ValidationError
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ImageUploadForm(forms.Form):
|
| 7 |
+
image = forms.ImageField(
|
| 8 |
+
widget=forms.ClearableFileInput(attrs={
|
| 9 |
+
'class': 'form-control',
|
| 10 |
+
'capture': 'user'
|
| 11 |
+
})
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
class ClassificationForm(forms.Form):
|
| 15 |
+
claim = forms.CharField(
|
| 16 |
+
label="Claim:",
|
| 17 |
+
widget=forms.Textarea(attrs={
|
| 18 |
+
'class': 'form-control',
|
| 19 |
+
'rows': 5,
|
| 20 |
+
'placeholder': 'Enter your claim or statement',
|
| 21 |
+
})
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
class RegisterFaceForm(forms.Form):
|
| 25 |
+
person = forms.CharField(
|
| 26 |
+
label="Person:",
|
| 27 |
+
widget=forms.TextInput(attrs={
|
| 28 |
+
'class': 'form-control',
|
| 29 |
+
'placeholder': 'e.g: ANWAR IBRAHIM',
|
| 30 |
+
})
|
| 31 |
+
)
|
| 32 |
+
keywords = forms.CharField(
|
| 33 |
+
label="Keyword:",
|
| 34 |
+
required=False,
|
| 35 |
+
widget=forms.TextInput(attrs={
|
| 36 |
+
'class': 'form-control',
|
| 37 |
+
'placeholder': 'e.g: Prime Minister of Malaysia',
|
| 38 |
+
})
|
| 39 |
+
)
|
| 40 |
+
images = forms.FileField(
|
| 41 |
+
required=False,
|
| 42 |
+
widget=MultipleFileInput(attrs={
|
| 43 |
+
'multiple': True,
|
| 44 |
+
'class': 'form-control',
|
| 45 |
+
'capture': 'user'
|
| 46 |
+
})
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
class TranscribeForm(forms.Form):
|
| 50 |
+
url = forms.CharField(
|
| 51 |
+
label="YouTube URL:",
|
| 52 |
+
required=False,
|
| 53 |
+
widget=forms.TextInput(attrs={
|
| 54 |
+
'type': 'url',
|
| 55 |
+
'class': 'form-control',
|
| 56 |
+
'placeholder': 'Enter YouTube URL',
|
| 57 |
+
|
| 58 |
+
})
|
| 59 |
+
)
|
| 60 |
+
file = forms.FileField(
|
| 61 |
+
label="Upload Audio/Video File",
|
| 62 |
+
required=False,
|
| 63 |
+
widget=forms.ClearableFileInput(attrs={
|
| 64 |
+
'class': 'form-control',
|
| 65 |
+
'accept': 'audio/*,video/*',
|
| 66 |
+
|
| 67 |
+
})
|
| 68 |
+
)
|
| 69 |
+
def clean(self):
|
| 70 |
+
cleaned_data = super().clean()
|
| 71 |
+
url = cleaned_data.get("url")
|
| 72 |
+
file = cleaned_data.get("file")
|
| 73 |
+
|
| 74 |
+
if not url and not file:
|
| 75 |
+
raise ValidationError("You must provide either a YouTube URL or upload a file.")
|
| 76 |
+
if url and file:
|
| 77 |
+
raise ValidationError("Please provide only one: YouTube URL or a file upload.")
|
| 78 |
+
|
| 79 |
+
class YouTubeURLForm(forms.Form):
|
| 80 |
+
youtube_url = forms.URLField(
|
| 81 |
+
label='YouTube Video URL',
|
| 82 |
+
widget=forms.URLInput(attrs={
|
| 83 |
+
'class': 'form-control',
|
| 84 |
+
'placeholder': 'https://www.youtube.com/watch?v=example'
|
| 85 |
+
})
|
| 86 |
+
)
|
ai_api/globals.py
ADDED
@@ -0,0 +1,6 @@
+devlab_image = None
+tokenizer = None
+model = None
+save_path = None
+whisper_model = None
+facenet_model = None
ai_api/library/apify_scraper.py
ADDED
|
@@ -0,0 +1,893 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# apify_scraper.py
|
| 2 |
+
# Updated version: Uses separate Apify tokens for Facebook and TikTok tasks
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
import time
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import hashlib
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
|
| 12 |
+
# Create cache directory
|
| 13 |
+
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
|
| 14 |
+
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 15 |
+
|
| 16 |
+
# Import configuration settings
|
| 17 |
+
try:
|
| 18 |
+
from .config import (
|
| 19 |
+
# API tokens
|
| 20 |
+
APIFY_TOKEN, APIFY_TOKEN_FB, APIFY_TOKEN_TIKTOK,
|
| 21 |
+
# Task IDs
|
| 22 |
+
POST_TASK_ID_SEARCH, COMMENT_TASK_ID, TIKTOK_VIDEO_TASK_ID, TIKTOK_COMMENT_TASK_ID,
|
| 23 |
+
# Data source settings
|
| 24 |
+
USE_FACEBOOK, USE_TIKTOK, USE_SERPAPI, USE_SERPER, USE_DUCKDUCKGO, USE_LOWYAT,
|
| 25 |
+
# Comment settings
|
| 26 |
+
USE_COMMENTS,
|
| 27 |
+
# Result limits
|
| 28 |
+
FACEBOOK_MAX_RESULTS, TIKTOK_MAX_RESULTS, WEB_SEARCH_MAX_RESULTS, LOWYAT_MAX_THREADS,
|
| 29 |
+
# Lowyat Forum settings
|
| 30 |
+
LOWYAT_SECTIONS
|
| 31 |
+
)
|
| 32 |
+
# Use settings from config
|
| 33 |
+
print("[✓] Using configuration from config.py")
|
| 34 |
+
except ImportError:
|
| 35 |
+
# Fallback to hardcoded settings
|
| 36 |
+
print("[⚠️] Config not found, using hardcoded settings")
|
| 37 |
+
# API tokens
|
| 38 |
+
APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB"
|
| 39 |
+
#APIFY_TOKEN_FB = APIFY_TOKEN
|
| 40 |
+
#APIFY_TOKEN_TIKTOK = APIFY_TOKEN
|
| 41 |
+
|
| 42 |
+
# Actor task IDs
|
| 43 |
+
#POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)
|
| 44 |
+
#COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)
|
| 45 |
+
TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)
|
| 46 |
+
TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Data source settings
|
| 51 |
+
USE_FACEBOOK = True
|
| 52 |
+
USE_TIKTOK = True
|
| 53 |
+
USE_SERPAPI = True
|
| 54 |
+
USE_SERPER = True
|
| 55 |
+
USE_DUCKDUCKGO = True
|
| 56 |
+
USE_LOWYAT = True
|
| 57 |
+
|
| 58 |
+
# Comment settings
|
| 59 |
+
USE_COMMENTS = True
|
| 60 |
+
|
| 61 |
+
# Result limits
|
| 62 |
+
FACEBOOK_MAX_RESULTS = 100
|
| 63 |
+
TIKTOK_MAX_RESULTS = 50
|
| 64 |
+
WEB_SEARCH_MAX_RESULTS = 20
|
| 65 |
+
LOWYAT_MAX_THREADS = 20
|
| 66 |
+
|
| 67 |
+
# Lowyat Forum settings
|
| 68 |
+
LOWYAT_SECTIONS = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 69 |
+
|
| 70 |
+
def run(keywords, output_path="output/claim_data.csv", fetch_comments=True, max_videos=30, max_comments=50, max_results=None):
|
| 71 |
+
"""Run data collection from multiple sources and combine results
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
keywords (list): List of keywords to search for
|
| 75 |
+
output_path (str): Path to save combined results
|
| 76 |
+
fetch_comments (bool): Whether to fetch comments for TikTok videos
|
| 77 |
+
max_videos (int): Maximum number of TikTok videos to fetch per keyword
|
| 78 |
+
max_comments (int): Maximum number of comments to fetch per TikTok video
|
| 79 |
+
max_results (int): Maximum results per source (overrides config settings)
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
pandas.DataFrame: Combined results from all sources
|
| 83 |
+
"""
|
| 84 |
+
all_records = []
|
| 85 |
+
|
| 86 |
+
# Use config settings if max_results not specified
|
| 87 |
+
fb_max = max_results or FACEBOOK_MAX_RESULTS
|
| 88 |
+
tiktok_max = max_results or TIKTOK_MAX_RESULTS
|
| 89 |
+
web_max = max_results or WEB_SEARCH_MAX_RESULTS
|
| 90 |
+
|
| 91 |
+
# Create output directory if it doesn't exist
|
| 92 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 93 |
+
# os.makedirs(output_path, exist_ok=True)
|
| 94 |
+
|
| 95 |
+
# Create a summary of data sources
|
| 96 |
+
sources_enabled = []
|
| 97 |
+
if USE_FACEBOOK: sources_enabled.append("Facebook")
|
| 98 |
+
if USE_TIKTOK: sources_enabled.append("TikTok")
|
| 99 |
+
if USE_SERPAPI: sources_enabled.append("SerpApi")
|
| 100 |
+
if USE_SERPER: sources_enabled.append("Serper.dev")
|
| 101 |
+
if USE_DUCKDUCKGO: sources_enabled.append("DuckDuckGo")
|
| 102 |
+
if USE_LOWYAT: sources_enabled.append("Lowyat Forum")
|
| 103 |
+
|
| 104 |
+
print(f"[📊] Data collection enabled for: {', '.join(sources_enabled)}")
|
| 105 |
+
print(f"[🔍] Original Keywords: {', '.join(keywords)}")
|
| 106 |
+
|
| 107 |
+
# Optimize keywords for different platforms
|
| 108 |
+
try:
|
| 109 |
+
from tiktok_keyword_formatter import optimize_keywords_for_platforms
|
| 110 |
+
optimized_keywords = optimize_keywords_for_platforms(keywords)
|
| 111 |
+
tiktok_keywords = optimized_keywords["tiktok"]
|
| 112 |
+
web_keywords = optimized_keywords["web_search"]
|
| 113 |
+
|
| 114 |
+
print(f"[🔍] TikTok Keywords: {', '.join(tiktok_keywords)}")
|
| 115 |
+
print(f"[🔍] Web Search Keywords: {', '.join(web_keywords)}")
|
| 116 |
+
except ImportError:
|
| 117 |
+
print("[⚠️] Keyword formatter not found. Using original keywords for all platforms.")
|
| 118 |
+
tiktok_keywords = keywords
|
| 119 |
+
web_keywords = keywords
|
| 120 |
+
|
| 121 |
+
# Facebook post search
|
| 122 |
+
if USE_FACEBOOK:
|
| 123 |
+
try:
|
| 124 |
+
boolean_query = build_boolean_search(keywords)
|
| 125 |
+
print(f"[📘] Facebook: {boolean_query}")
|
| 126 |
+
post_input = {"search": boolean_query, "resultsPerPage": min(fb_max, 100)}
|
| 127 |
+
|
| 128 |
+
post_dataset_id = run_actor_task(POST_TASK_ID_SEARCH, post_input, platform="facebook")
|
| 129 |
+
posts = download_dataset(post_dataset_id, platform="facebook")
|
| 130 |
+
print(f"[📘] Retrieved {len(posts)} Facebook posts")
|
| 131 |
+
|
| 132 |
+
fb_records = []
|
| 133 |
+
for post in posts:
|
| 134 |
+
# Check if this is Malaysian content
|
| 135 |
+
username = post.get("username", "")
|
| 136 |
+
text = post.get("text", "")
|
| 137 |
+
post_url = post.get("url")
|
| 138 |
+
|
| 139 |
+
if is_malaysian_content(username, text):
|
| 140 |
+
# Add the post itself
|
| 141 |
+
post_record = {
|
| 142 |
+
"platform": "facebook",
|
| 143 |
+
"date": post.get("createdAt"),
|
| 144 |
+
"username": username,
|
| 145 |
+
"post_text": text,
|
| 146 |
+
"post_url": post_url,
|
| 147 |
+
"likes": post.get("likes", 0),
|
| 148 |
+
"shares": post.get("shares", 0),
|
| 149 |
+
"comments_count": post.get("commentsCount", 0),
|
| 150 |
+
"comment_text": "",
|
| 151 |
+
"combined_text": text
|
| 152 |
+
}
|
| 153 |
+
fb_records.append(post_record)
|
| 154 |
+
|
| 155 |
+
# If comments are enabled and the post has comments, scrape them
|
| 156 |
+
if USE_COMMENTS and post.get("commentsCount", 0) > 0 and post_url:
|
| 157 |
+
try:
|
| 158 |
+
print(f"[💬] Scraping comments for Facebook post: {post_url}")
|
| 159 |
+
comment_input = {"url": post_url, "maxComments": 50}
|
| 160 |
+
comment_dataset_id = run_actor_task(COMMENT_TASK_ID, comment_input, platform="facebook")
|
| 161 |
+
comments = download_dataset(comment_dataset_id, platform="facebook")
|
| 162 |
+
print(f"[💬] Retrieved {len(comments)} comments for post")
|
| 163 |
+
|
| 164 |
+
for comment in comments:
|
| 165 |
+
comment_text = comment.get("text", "")
|
| 166 |
+
comment_username = comment.get("name", "")
|
| 167 |
+
|
| 168 |
+
if is_malaysian_content(comment_username, comment_text):
|
| 169 |
+
comment_record = {
|
| 170 |
+
"platform": "facebook_comment",
|
| 171 |
+
"date": comment.get("date"),
|
| 172 |
+
"username": comment_username,
|
| 173 |
+
"post_text": "",
|
| 174 |
+
"post_url": post_url,
|
| 175 |
+
"likes": comment.get("likes", 0),
|
| 176 |
+
"shares": 0,
|
| 177 |
+
"comments_count": 0,
|
| 178 |
+
"comment_text": comment_text,
|
| 179 |
+
"combined_text": comment_text
|
| 180 |
+
}
|
| 181 |
+
fb_records.append(comment_record)
|
| 182 |
+
except Exception as e:
|
| 183 |
+
print(f"[❌] Error scraping comments for post {post_url}: {str(e)}")
|
| 184 |
+
print("[⚠️] Continuing with next post...")
|
| 185 |
+
|
| 186 |
+
print(f"[📊] Added {len(fb_records)} Facebook records after filtering")
|
| 187 |
+
all_records.extend(fb_records)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
print(f"[❌] Error during Facebook scraping: {str(e)}")
|
| 190 |
+
print("[⚠️] Continuing with other data sources...")
|
| 191 |
+
|
| 192 |
+
# TikTok scraping
|
| 193 |
+
if USE_TIKTOK:
|
| 194 |
+
try:
|
| 195 |
+
print(f"[📽️] TikTok: Searching for {', '.join(tiktok_keywords)}")
|
| 196 |
+
tiktok_records = []
|
| 197 |
+
|
| 198 |
+
# Use only the top 3 most relevant keywords as requested
|
| 199 |
+
top_keywords = tiktok_keywords[:min(3, len(tiktok_keywords))]
|
| 200 |
+
print(f"[📽️] Using top {len(top_keywords)} TikTok keywords: {', '.join(top_keywords)}")
|
| 201 |
+
|
| 202 |
+
# Set video limits as requested by user
|
| 203 |
+
videos_per_keyword = max_videos # Use the parameter value
|
| 204 |
+
|
| 205 |
+
# No total video limit - collect exactly max_videos per keyword
|
| 206 |
+
total_videos_collected = 0
|
| 207 |
+
max_total_videos = max_videos * len(top_keywords) # Allow max_videos per keyword
|
| 208 |
+
|
| 209 |
+
# for keyword in top_keywords:
|
| 210 |
+
try:
|
| 211 |
+
# Print detailed debugging information
|
| 212 |
+
print(f"[📽️] DEBUG: TikTok API Token: {APIFY_TOKEN_TIKTOK[:5]}...{APIFY_TOKEN_TIKTOK[-5:]}")
|
| 213 |
+
print(f"[📽️] DEBUG: TikTok Video Task ID: {TIKTOK_VIDEO_TASK_ID}")
|
| 214 |
+
print(f"[📽️] DEBUG: TikTok Comment Task ID: {TIKTOK_COMMENT_TASK_ID}")
|
| 215 |
+
|
| 216 |
+
keyword = ', '.join(tiktok_keywords)
|
| 217 |
+
|
| 218 |
+
# Limit videos per keyword to save costs
|
| 219 |
+
tiktok_input = { "searchQueries": [keyword], "maxVideos": videos_per_keyword}
|
| 220 |
+
# tiktok_input ={"searchQueries": keyword}
|
| 221 |
+
print(f"[📽️] Requesting {videos_per_keyword} TikTok videos for: {keyword}")
|
| 222 |
+
print(f"[📽️] DEBUG: Full input payload: {tiktok_input}")
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
try:
|
| 226 |
+
tiktok_dataset_id = run_actor_task(TIKTOK_VIDEO_TASK_ID, tiktok_input, platform="tiktok")
|
| 227 |
+
print(f"[📽️] DEBUG: Successfully got dataset ID: {tiktok_dataset_id}")
|
| 228 |
+
videos = download_dataset(tiktok_dataset_id, platform="tiktok")
|
| 229 |
+
print(f"[📽️] Retrieved {len(videos)} TikTok videos for: {keyword}")
|
| 230 |
+
except Exception as e:
|
| 231 |
+
print(f"[❌] DETAILED ERROR in TikTok video extraction: {str(e)}")
|
| 232 |
+
print(f"[❌] Error type: {type(e).__name__}")
|
| 233 |
+
import traceback
|
| 234 |
+
print(f"[❌] Traceback: {traceback.format_exc()}")
|
| 235 |
+
videos = []
|
| 236 |
+
|
| 237 |
+
for video in videos:
|
| 238 |
+
# Check if we've reached the maximum total videos limit
|
| 239 |
+
if total_videos_collected >= max_total_videos:
|
| 240 |
+
print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping collection.")
|
| 241 |
+
break
|
| 242 |
+
|
| 243 |
+
username = video.get("authorMeta", {}).get("userName", "") or video.get("authorMeta", {}).get("name", "")
|
| 244 |
+
caption = video.get("text", "")
|
| 245 |
+
|
| 246 |
+
if is_malaysian_content(username, caption):
|
| 247 |
+
# Increment the total videos counter
|
| 248 |
+
total_videos_collected += 1
|
| 249 |
+
video_url = video.get("webVideoUrl") or video.get("videoUrl")
|
| 250 |
+
clean_url = video_url.split("?")[0] if video_url and "/video/" in video_url else None
|
| 251 |
+
|
| 252 |
+
video_record = {
|
| 253 |
+
"platform": "tiktok",
|
| 254 |
+
"date": video.get("createTimeISO") or video.get("createTime"),
|
| 255 |
+
"username": username,
|
| 256 |
+
"post_text": caption,
|
| 257 |
+
"post_url": clean_url,
|
| 258 |
+
"likes": video.get("diggCount", 0),
|
| 259 |
+
"shares": video.get("shareCount", 0),
|
| 260 |
+
"comments_count": video.get("commentCount", 0),
|
| 261 |
+
"comment_text": "",
|
| 262 |
+
"combined_text": caption
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
tiktok_records.append(video_record)
|
| 266 |
+
|
| 267 |
+
# If comments are enabled and the video has comments, scrape them
|
| 268 |
+
# Get comments per video as requested by the user
|
| 269 |
+
min_comments_threshold = 5 # Lower threshold to ensure we get comments
|
| 270 |
+
max_comments_to_scrape = max_comments # Use the parameter value
|
| 271 |
+
max_videos_with_comments = 10 # Allow more videos with comments
|
| 272 |
+
|
| 273 |
+
# Track how many videos we've scraped comments for
|
| 274 |
+
if not hasattr(run, 'videos_with_comments_count'):
|
| 275 |
+
run.videos_with_comments_count = 0
|
| 276 |
+
|
| 277 |
+
if (fetch_comments and
|
| 278 |
+
run.videos_with_comments_count < max_videos_with_comments and
|
| 279 |
+
video.get("commentCount", 0) >= min_comments_threshold and
|
| 280 |
+
clean_url and
|
| 281 |
+
video.get("diggCount", 0) > 10): # Very low threshold to ensure we get comments for most videos
|
| 282 |
+
try:
|
| 283 |
+
print(f"[💬] Scraping comments for popular TikTok video ({run.videos_with_comments_count+1}/{max_videos_with_comments}): {clean_url}")
|
| 284 |
+
comment_input = {"postURLs": [clean_url], "commentsPerPost": max_comments_to_scrape}
|
| 285 |
+
print(f"[💬] DEBUG: Comment input payload: {comment_input}")
|
| 286 |
+
|
| 287 |
+
try:
|
| 288 |
+
comment_dataset_id = run_actor_task(TIKTOK_COMMENT_TASK_ID, comment_input, platform="tiktok")
|
| 289 |
+
print(f"[💬] DEBUG: Successfully got comment dataset ID: {comment_dataset_id}")
|
| 290 |
+
comments = download_dataset(comment_dataset_id, platform="tiktok")
|
| 291 |
+
run.videos_with_comments_count += 1
|
| 292 |
+
print(f"[💬] Retrieved {len(comments)} comments for video")
|
| 293 |
+
except Exception as e:
|
| 294 |
+
print(f"[❌] DETAILED ERROR in TikTok comment extraction: {str(e)}")
|
| 295 |
+
print(f"[❌] Error type: {type(e).__name__}")
|
| 296 |
+
import traceback
|
| 297 |
+
print(f"[❌] Traceback: {traceback.format_exc()}")
|
| 298 |
+
comments = []
|
| 299 |
+
|
| 300 |
+
for comment in comments:
|
| 301 |
+
comment_text = comment.get("text", "")
|
| 302 |
+
comment_username = comment.get("author", {}).get("uniqueId", "") or comment.get("author", {}).get("nickname", "")
|
| 303 |
+
|
| 304 |
+
if is_malaysian_content(comment_username, comment_text):
|
| 305 |
+
comment_record = {
|
| 306 |
+
"platform": "tiktok_comment",
|
| 307 |
+
"date": comment.get("createTime"),
|
| 308 |
+
"username": comment_username,
|
| 309 |
+
"post_text": "",
|
| 310 |
+
"post_url": clean_url,
|
| 311 |
+
"likes": comment.get("diggCount", 0),
|
| 312 |
+
"shares": 0,
|
| 313 |
+
"comments_count": 0,
|
| 314 |
+
"comment_text": comment_text,
|
| 315 |
+
"combined_text": comment_text
|
| 316 |
+
}
|
| 317 |
+
tiktok_records.append(comment_record)
|
| 318 |
+
except Exception as e:
|
| 319 |
+
print(f"[❌] Error scraping comments for video {clean_url}: {str(e)}")
|
| 320 |
+
print("[⚠️] Continuing with next video...")
|
| 321 |
+
# Check if we've reached the maximum total videos limit after processing this keyword
|
| 322 |
+
if total_videos_collected >= max_total_videos:
|
| 323 |
+
print(f"[⚠️] Reached maximum limit of {max_total_videos} videos. Stopping keyword search.")
|
| 324 |
+
break
|
| 325 |
+
except Exception as e:
|
| 326 |
+
print(f"[❌] Error processing TikTok keyword '{keyword}': {str(e)}")
|
| 327 |
+
print("[⚠️] Continuing with next keyword...")
|
| 328 |
+
|
| 329 |
+
print(f"[📊] Added {len(tiktok_records)} TikTok records after filtering")
|
| 330 |
+
all_records.extend(tiktok_records)
|
| 331 |
+
except Exception as e:
|
| 332 |
+
print(f"[❌] Error during TikTok scraping: {str(e)}")
|
| 333 |
+
print("[⚠️] Continuing with other data sources...")
|
| 334 |
+
|
| 335 |
+
# Web search (SerpApi, Serper.dev, DuckDuckGo)
|
| 336 |
+
if USE_SERPAPI or USE_SERPER or USE_DUCKDUCKGO:
|
| 337 |
+
try:
|
| 338 |
+
print(f"[🌐] Web Search: Searching for {', '.join(web_keywords)}")
|
| 339 |
+
web_search_output = f"output/{os.path.basename(output_path).split('.')[0]}_web.csv"
|
| 340 |
+
|
| 341 |
+
# Try to import the run_web_search function
|
| 342 |
+
try:
|
| 343 |
+
from run_web_search import run_web_search
|
| 344 |
+
|
| 345 |
+
# Get the full claim from the environment if available
|
| 346 |
+
full_claim = os.environ.get("FULL_CLAIM", None)
|
| 347 |
+
if full_claim:
|
| 348 |
+
print(f"[🔍] Using full claim for web search: {full_claim}")
|
| 349 |
+
|
| 350 |
+
# Pass configuration settings to run_web_search
|
| 351 |
+
web_results_count = run_web_search(
|
| 352 |
+
web_keywords,
|
| 353 |
+
web_search_output,
|
| 354 |
+
num_results=web_max,
|
| 355 |
+
use_serpapi=USE_SERPAPI,
|
| 356 |
+
use_serper=USE_SERPER,
|
| 357 |
+
use_duckduckgo=USE_DUCKDUCKGO,
|
| 358 |
+
full_claim=full_claim
|
| 359 |
+
)
|
| 360 |
+
print(f"[🌐] Retrieved {web_results_count} web search results")
|
| 361 |
+
|
| 362 |
+
# If web search was successful, read the results and add to all_records
|
| 363 |
+
if web_results_count > 0:
|
| 364 |
+
try:
|
| 365 |
+
web_df = pd.read_csv(web_search_output)
|
| 366 |
+
web_records = web_df.to_dict('records')
|
| 367 |
+
all_records.extend(web_records)
|
| 368 |
+
print(f"[📊] Added {len(web_records)} web search records")
|
| 369 |
+
except Exception as e:
|
| 370 |
+
print(f"[❌] Error reading web search results: {str(e)}")
|
| 371 |
+
except ImportError:
|
| 372 |
+
print("[⚠️] Web search module not found. Skipping web search.")
|
| 373 |
+
except Exception as e:
|
| 374 |
+
print(f"[❌] Error during web search: {str(e)}")
|
| 375 |
+
|
| 376 |
+
# Lowyat Forum data collection
|
| 377 |
+
if USE_LOWYAT:
|
| 378 |
+
try:
|
| 379 |
+
print(f"[📚] Collecting data from Lowyat Forum...")
|
| 380 |
+
|
| 381 |
+
# Import the Lowyat Forum crawler
|
| 382 |
+
try:
|
| 383 |
+
from lowyat_crawler import run_lowyat_crawler
|
| 384 |
+
|
| 385 |
+
# Use the same keywords for Lowyat Forum
|
| 386 |
+
lowyat_keywords = keywords
|
| 387 |
+
|
| 388 |
+
# Check for environment variable override for sections
|
| 389 |
+
sections_to_use = LOWYAT_SECTIONS
|
| 390 |
+
if os.environ.get("LOWYAT_SECTIONS"):
|
| 391 |
+
sections_to_use = os.environ.get("LOWYAT_SECTIONS").split(",")
|
| 392 |
+
print(f"[📚] Using Lowyat Forum sections from environment: {', '.join(sections_to_use)}")
|
| 393 |
+
|
| 394 |
+
# Get the full claim from the environment if available
|
| 395 |
+
full_claim = os.environ.get("FULL_CLAIM", None)
|
| 396 |
+
if full_claim:
|
| 397 |
+
print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
|
| 398 |
+
|
| 399 |
+
# Get Lowyat Forum data
|
| 400 |
+
lowyat_output_path = output_path.replace(".csv", "_lowyat.csv")
|
| 401 |
+
try:
|
| 402 |
+
lowyat_df = run_lowyat_crawler(
|
| 403 |
+
lowyat_keywords,
|
| 404 |
+
sections=sections_to_use,
|
| 405 |
+
max_threads=LOWYAT_MAX_THREADS,
|
| 406 |
+
output_path=lowyat_output_path,
|
| 407 |
+
full_claim=full_claim
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# Convert DataFrame to records and add to all_records
|
| 411 |
+
if not lowyat_df.empty:
|
| 412 |
+
lowyat_records = lowyat_df.to_dict('records')
|
| 413 |
+
all_records.extend(lowyat_records)
|
| 414 |
+
print(f"[📚] Added {len(lowyat_records)} Lowyat Forum records")
|
| 415 |
+
else:
|
| 416 |
+
print(f"[⚠️] No Lowyat Forum data found for keywords: {', '.join(lowyat_keywords)}")
|
| 417 |
+
|
| 418 |
+
# Generate sample data for testing if needed
|
| 419 |
+
if os.environ.get("GENERATE_SAMPLE_LOWYAT_DATA", "false").lower() == "true":
|
| 420 |
+
print("[📚] Generating sample Lowyat Forum data for testing...")
|
| 421 |
+
|
| 422 |
+
# Create a sample dataframe with the claim
|
| 423 |
+
from datetime import datetime
|
| 424 |
+
current_date = datetime.now().strftime('%Y-%m-%d')
|
| 425 |
+
|
| 426 |
+
# Get the claim text or keywords
|
| 427 |
+
claim_text = full_claim if full_claim else ', '.join(lowyat_keywords)
|
| 428 |
+
|
| 429 |
+
# Create relevant sample data based on claim content
|
| 430 |
+
sample_data = []
|
| 431 |
+
|
| 432 |
+
# Check for different types of claims and create relevant sample data
|
| 433 |
+
if any(term in claim_text.lower() for term in ['hon', 'tenonet', 'kenderaan', 'kereta']):
|
| 434 |
+
# Horn/vehicle related claim
|
| 435 |
+
sample_data.append({
|
| 436 |
+
'platform': 'LowyatForum',
|
| 437 |
+
'date': current_date,
|
| 438 |
+
'username': 'CarEnthusiast',
|
| 439 |
+
'post_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini.",
|
| 440 |
+
'post_url': 'https://forum.lowyat.net/topic/hon-tenonet',
|
| 441 |
+
'likes': 15,
|
| 442 |
+
'shares': 3,
|
| 443 |
+
'comments_count': 8,
|
| 444 |
+
'comment_text': '',
|
| 445 |
+
'combined_text': f"Adakah sesiapa tahu tentang undang-undang berkaitan hon tenonet? Saya dengar JPJ sedang menjalankan operasi terhadap kenderaan yang menggunakan hon jenis ini."
|
| 446 |
+
})
|
| 447 |
+
|
| 448 |
+
sample_data.append({
|
| 449 |
+
'platform': 'LowyatForum_Comment',
|
| 450 |
+
'date': current_date,
|
| 451 |
+
'username': 'LegalExpert',
|
| 452 |
+
'post_text': '',
|
| 453 |
+
'post_url': 'https://forum.lowyat.net/topic/hon-tenonet#comment1',
|
| 454 |
+
'likes': 7,
|
| 455 |
+
'shares': 0,
|
| 456 |
+
'comments_count': 0,
|
| 457 |
+
'comment_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000.",
|
| 458 |
+
'combined_text': "Ya, penggunaan hon tenonet adalah menyalahi undang-undang kerana boleh mengelirukan pemandu lain dan menyebabkan kemalangan. Denda boleh mencecah RM2,000."
|
| 459 |
+
})
|
| 460 |
+
|
| 461 |
+
elif any(term in claim_text.lower() for term in ['kelantan', 'rogol', 'sumbang mahram', 'jenayah']):
|
| 462 |
+
# Crime in Kelantan related claim
|
| 463 |
+
sample_data.append({
|
| 464 |
+
'platform': 'LowyatForum',
|
| 465 |
+
'date': current_date,
|
| 466 |
+
'username': 'SocialObserver',
|
| 467 |
+
'post_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini.",
|
| 468 |
+
'post_url': 'https://forum.lowyat.net/topic/crime-statistics',
|
| 469 |
+
'likes': 12,
|
| 470 |
+
'shares': 5,
|
| 471 |
+
'comments_count': 7,
|
| 472 |
+
'comment_text': '',
|
| 473 |
+
'combined_text': f"Statistik jenayah seksual di Kelantan semakin membimbangkan. Menurut laporan polis, kes rogol dan sumbang mahram meningkat sebanyak 15% tahun ini."
|
| 474 |
+
})
|
| 475 |
+
|
| 476 |
+
sample_data.append({
|
| 477 |
+
'platform': 'LowyatForum_Comment',
|
| 478 |
+
'date': current_date,
|
| 479 |
+
'username': 'CommunityLeader',
|
| 480 |
+
'post_text': '',
|
| 481 |
+
'post_url': 'https://forum.lowyat.net/topic/crime-statistics#comment1',
|
| 482 |
+
'likes': 8,
|
| 483 |
+
'shares': 0,
|
| 484 |
+
'comments_count': 0,
|
| 485 |
+
'comment_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah.",
|
| 486 |
+
'combined_text': "Kita perlu lebih banyak program kesedaran dan pendidikan untuk menangani masalah ini. Pihak berkuasa juga perlu mengambil tindakan lebih tegas terhadap pesalah."
|
| 487 |
+
})
|
| 488 |
+
|
| 489 |
+
elif any(term in claim_text.lower() for term in ['kelongsong', 'peluru', 'senjata', 'tan']):
|
| 490 |
+
# Ammunition related claim
|
| 491 |
+
sample_data.append({
|
| 492 |
+
'platform': 'LowyatForum',
|
| 493 |
+
'date': current_date,
|
| 494 |
+
'username': 'SecurityAnalyst',
|
| 495 |
+
'post_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?",
|
| 496 |
+
'post_url': 'https://forum.lowyat.net/topic/security-threat',
|
| 497 |
+
'likes': 25,
|
| 498 |
+
'shares': 10,
|
| 499 |
+
'comments_count': 15,
|
| 500 |
+
'comment_text': '',
|
| 501 |
+
'combined_text': f"Penemuan 50 tan kelongsong dan peluru di kilang haram membimbangkan. Adakah ini menunjukkan ancaman keselamatan yang serius?"
|
| 502 |
+
})
|
| 503 |
+
|
| 504 |
+
sample_data.append({
|
| 505 |
+
'platform': 'LowyatForum_Comment',
|
| 506 |
+
'date': current_date,
|
| 507 |
+
'username': 'DefenseExpert',
|
| 508 |
+
'post_text': '',
|
| 509 |
+
'post_url': 'https://forum.lowyat.net/topic/security-threat#comment1',
|
| 510 |
+
'likes': 18,
|
| 511 |
+
'shares': 0,
|
| 512 |
+
'comments_count': 0,
|
| 513 |
+
'comment_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah.",
|
| 514 |
+
'combined_text': "Menurut sumber, kelongsong tersebut adalah untuk dikitar semula dan bukan untuk kegunaan senjata aktif. Namun, ia tetap menyalahi undang-undang kerana tidak mempunyai permit yang sah."
|
| 515 |
+
})
|
| 516 |
+
|
| 517 |
+
elif any(term in claim_text.lower() for term in ['minyak sawit', 'cukai', 'ekonomi']):
|
| 518 |
+
# Palm oil tax related claim
|
| 519 |
+
sample_data.append({
|
| 520 |
+
'platform': 'LowyatForum',
|
| 521 |
+
'date': current_date,
|
| 522 |
+
'username': 'EconomyWatcher',
|
| 523 |
+
'post_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara.",
|
| 524 |
+
'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax',
|
| 525 |
+
'likes': 20,
|
| 526 |
+
'shares': 8,
|
| 527 |
+
'comments_count': 12,
|
| 528 |
+
'comment_text': '',
|
| 529 |
+
'combined_text': f"Adakah benar kerajaan akan mengenakan cukai khas terhadap minyak sawit mentah? Ini akan memberi kesan besar kepada industri dan ekonomi negara."
|
| 530 |
+
})
|
| 531 |
+
|
| 532 |
+
sample_data.append({
|
| 533 |
+
'platform': 'LowyatForum_Comment',
|
| 534 |
+
'date': current_date,
|
| 535 |
+
'username': 'IndustryInsider',
|
| 536 |
+
'post_text': '',
|
| 537 |
+
'post_url': 'https://forum.lowyat.net/topic/palm-oil-tax#comment1',
|
| 538 |
+
'likes': 15,
|
| 539 |
+
'shares': 0,
|
| 540 |
+
'comments_count': 0,
|
| 541 |
+
'comment_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak.",
|
| 542 |
+
'combined_text': "Menurut sumber dari kementerian, cadangan cukai ini masih dalam peringkat kajian dan belum ada keputusan muktamad. Namun, jika dilaksanakan, ia akan memberi kesan kepada harga minyak masak."
|
| 543 |
+
})
|
| 544 |
+
|
| 545 |
+
else:
|
| 546 |
+
# Default generic sample data if no specific claim type is detected
|
| 547 |
+
sample_data.append({
|
| 548 |
+
'platform': 'LowyatForum',
|
| 549 |
+
'date': current_date,
|
| 550 |
+
'username': 'LowyatUser123',
|
| 551 |
+
'post_text': f"Discussing: {claim_text}",
|
| 552 |
+
'post_url': 'https://forum.lowyat.net/topic/sample',
|
| 553 |
+
'likes': 5,
|
| 554 |
+
'shares': 0,
|
| 555 |
+
'comments_count': 2,
|
| 556 |
+
'comment_text': '',
|
| 557 |
+
'combined_text': f"Discussing: {claim_text}"
|
| 558 |
+
})
|
| 559 |
+
|
| 560 |
+
sample_data.append({
|
| 561 |
+
'platform': 'LowyatForum_Comment',
|
| 562 |
+
'date': current_date,
|
| 563 |
+
'username': 'LowyatCommenter',
|
| 564 |
+
'post_text': '',
|
| 565 |
+
'post_url': 'https://forum.lowyat.net/topic/sample#comment1',
|
| 566 |
+
'likes': 2,
|
| 567 |
+
'shares': 0,
|
| 568 |
+
'comments_count': 0,
|
| 569 |
+
'comment_text': f"Commenting on: {claim_text}",
|
| 570 |
+
'combined_text': f"Commenting on: {claim_text}"
|
| 571 |
+
})
|
| 572 |
+
|
| 573 |
+
# If no sample data was created (unlikely), create a default one
|
| 574 |
+
if not sample_data:
|
| 575 |
+
sample_data.append({
|
| 576 |
+
'platform': 'LowyatForum',
|
| 577 |
+
'date': current_date,
|
| 578 |
+
'username': 'LowyatUser123',
|
| 579 |
+
'post_text': f"Discussing: {claim_text}",
|
| 580 |
+
'post_url': 'https://forum.lowyat.net/topic/sample',
|
| 581 |
+
'likes': 5,
|
| 582 |
+
'shares': 0,
|
| 583 |
+
'comments_count': 2,
|
| 584 |
+
'comment_text': '',
|
| 585 |
+
'combined_text': f"Discussing: {claim_text}"
|
| 586 |
+
})
|
| 587 |
+
|
| 588 |
+
sample_df = pd.DataFrame(sample_data)
|
| 589 |
+
if lowyat_output_path:
|
| 590 |
+
sample_df.to_csv(lowyat_output_path, index=False)
|
| 591 |
+
|
| 592 |
+
all_records.extend(sample_data)
|
| 593 |
+
print(f"[📚] Added {len(sample_data)} sample Lowyat Forum records")
|
| 594 |
+
except Exception as e:
|
| 595 |
+
print(f"[⚠️] Error during Lowyat Forum crawling: {str(e)}")
|
| 596 |
+
print("[⚠️] Continuing without Lowyat Forum data...")
|
| 597 |
+
|
| 598 |
+
except ImportError:
|
| 599 |
+
print("[❌] Lowyat Forum crawler module not found. Skipping Lowyat Forum data collection.")
|
| 600 |
+
|
| 601 |
+
except Exception as e:
|
| 602 |
+
print(f"[❌] Error during Lowyat Forum data collection: {str(e)}")
|
| 603 |
+
print("[⚠️] Continuing with other data sources...")
|
| 604 |
+
|
| 605 |
+
# Save all records to CSV
|
| 606 |
+
if all_records:
|
| 607 |
+
df = pd.DataFrame(all_records)
|
| 608 |
+
df.to_csv(output_path, index=False)
|
| 609 |
+
print(f"[💾] Saved {len(df)} records to {output_path}")
|
| 610 |
+
|
| 611 |
+
# Print summary of data sources
|
| 612 |
+
source_counts = df['platform'].value_counts().to_dict()
|
| 613 |
+
print("\n[📊] Data collection summary:")
|
| 614 |
+
for source, count in source_counts.items():
|
| 615 |
+
# Use shorter display names for Lowyat Forum sources
|
| 616 |
+
display_source = source
|
| 617 |
+
if source == "LowyatForum":
|
| 618 |
+
display_source = "LF"
|
| 619 |
+
elif source == "LowyatForum_Comment":
|
| 620 |
+
display_source = "LF_Comment"
|
| 621 |
+
print(f" - {display_source}: {count} records")
|
| 622 |
+
|
| 623 |
+
return df
|
| 624 |
+
else:
|
| 625 |
+
# Create empty DataFrame and save to CSV
|
| 626 |
+
empty_df = pd.DataFrame(columns=["platform", "date", "username", "post_text", "post_url", "likes", "shares", "comments_count", "comment_text", "combined_text"])
|
| 627 |
+
empty_df.to_csv(output_path, index=False)
|
| 628 |
+
print(f"[⚠️] No records found. Saved empty DataFrame to {output_path}")
|
| 629 |
+
return empty_df
|
| 630 |
+
|
| 631 |
+
def run_actor_task(task_id, input_payload, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
    # Generate a cache key based on task_id and input_payload
    cache_key = f"{task_id}_{json.dumps(input_payload, sort_keys=True)}"
    cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
    cache_file = os.path.join(CACHE_DIR, f"{cache_hash}.json")

    # Check if we have a valid cached result
    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cache_data = json.load(f)

            # Check if cache is still valid
            cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
            cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)

            if datetime.now() < cache_expiry:
                print(f"[💾] Using cached result for task {task_id} (expires {cache_expiry.isoformat()})")
                return cache_data.get('dataset_id')
            else:
                print(f"[⏰] Cache expired for task {task_id}, fetching fresh data")
        except Exception as e:
            print(f"[⚠️] Error reading cache: {str(e)}")

    token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    url = f"https://api.apify.com/v2/actor-tasks/{task_id}/runs"

    # Try multiple times in case of network issues
    for attempt in range(max_retries):
        try:
            print(f"[🔄] Attempt {attempt+1}/{max_retries} to run task {task_id}...")
            print(input_payload)
            # response = requests.post(url, json={"input": input_payload}, headers=headers, timeout=timeout)
            response = requests.post(url, json=input_payload, headers=headers, timeout=timeout)

            if response.status_code != 201:
                print(f"[❌] Failed to run task: {response.text}")
                if attempt < max_retries - 1:
                    print("[⏳] Retrying...")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    continue
                raise Exception(f"Task run failed after {max_retries} attempts.")

            run_id = response.json()["data"]["id"]
            print(f"[🟢] Task {task_id} started: {run_id}")
            status_url = f"https://api.apify.com/v2/actor-runs/{run_id}"
            break  # Success, exit the retry loop
        except requests.exceptions.Timeout:
            print(f"[❌] Request timed out after {timeout} seconds")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Task run timed out after {max_retries} attempts.")
        except requests.exceptions.ConnectionError:
            print(f"[❌] Connection error")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Connection error after {max_retries} attempts.")
        except Exception as e:
            print(f"[❌] Unexpected error: {str(e)}")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")

    while True:
        status_data = requests.get(status_url, headers=headers).json()
        if status_data["data"]["status"] in ["SUCCEEDED", "FAILED"]:
            break
        print("[⏳] Waiting for task run to complete...")
        time.sleep(5)

    if status_data["data"]["status"] == "SUCCEEDED":
        dataset_id = status_data["data"]["defaultDatasetId"]

        # Save result to cache
        if use_cache:
            try:
                cache_data = {
                    "dataset_id": dataset_id,
                    "timestamp": datetime.now().isoformat(),
                    "task_id": task_id,
                    "platform": platform
                }

                with open(cache_file, 'w') as f:
                    json.dump(cache_data, f)

                print(f"[💾] Saved result to cache: {cache_file}")
            except Exception as e:
                print(f"[⚠️] Error saving to cache: {str(e)}")

        return dataset_id
    else:
        raise Exception("Task run failed.")

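For reference, the cache key above depends only on the task ID and the JSON-serialised payload (keys sorted), so identical requests always resolve to the same file under CACHE_DIR. A minimal sketch of that mapping, using an illustrative payload (the real field names depend on the Apify actor):

import hashlib, json, os

CACHE_DIR = "cache"  # stand-in for the module-level cache directory
task_id = "rfk0BzRAjuLPbccaZ"
payload = {"searchQueries": ["minyak sawit"], "resultsPerPage": 10}  # illustrative fields

cache_key = f"{task_id}_{json.dumps(payload, sort_keys=True)}"
cache_file = os.path.join(CACHE_DIR, hashlib.md5(cache_key.encode()).hexdigest() + ".json")
print(cache_file)  # same payload in any key order -> same cache file
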
def is_malaysian_content(username, text):
    # Check if content is relevant to the claim
    user_lower = (username or "").lower()
    text_lower = (text or "").lower()

    # Get the full claim from environment if available
    full_claim = os.environ.get("FULL_CLAIM", "")
    claim_lower = full_claim.lower()

    # Check if this is about sexual crimes in Kelantan
    kelantan_sexual_crime = "kelantan" in claim_lower and ("rogol" in claim_lower or "sumbang mahram" in claim_lower)

    if kelantan_sexual_crime:
        # For the specific claim about sexual crimes in Kelantan, use very targeted filtering
        kelantan_keywords = ["kelantan", "kelantanese"]
        crime_keywords = ["rogol", "sumbang mahram", "jenayah seksual", "kes", "polis", "pdrm"]

        # Must have at least one Kelantan reference AND one crime reference to be relevant
        has_kelantan_ref = any(k in text_lower for k in kelantan_keywords)
        has_crime_ref = any(k in text_lower for k in crime_keywords)

        if has_kelantan_ref and has_crime_ref:
            return True

        # Check if username is from a relevant authority
        authority_users = ["polis", "pdrm", "kelantan", "bukit aman", "bernama", "berita"]
        if any(k in user_lower for k in authority_users):
            return True

        # More restrictive for this specific claim - return False if not matching criteria
        return False
    else:
        # General Malaysian content detection for other claims
        # Keywords for crime-related content
        crime_keywords = [
            "polis", "kelantan", "jenayah", "rogol", "sumbang mahram", "inses",
            "kes", "statistik", "bimbang", "pdrm", "malaysia", "undang-undang",
            "mahkamah", "hukuman", "tangkap", "siasat", "lapor", "mangsa", "suspek",
            "tertuduh", "penderaan", "seksual", "cabul", "gangguan"
        ]

        # Check if any crime keywords are in the text
        if any(k in text_lower for k in crime_keywords):
            return True

        # Check if username looks Malaysian
        malaysian_user_indicators = [
            "my", "ms", "malaysia", "officialmy", "rakyat", "malay",
            "dr", "dato", "yb", "ustaz", "cikgu", "polis", "kelantan"
        ]

        if any(k in user_lower for k in malaysian_user_indicators):
            return True

        # Default to True for now to maximize data collection, but with better filtering
        return True

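A quick illustration of how this filter behaves; the records below are made-up examples, and FULL_CLAIM is only honoured if the caller exported it beforehand:

os.environ["FULL_CLAIM"] = "Kes rogol dan sumbang mahram di Kelantan paling tinggi"

samples = [
    ("PDRM Kelantan", "Statistik kes rogol di Kelantan menurun tahun ini"),
    ("random_user", "Best place for nasi lemak in KL?"),
]
for username, text in samples:
    print(username, is_malaysian_content(username, text))
# With a Kelantan sexual-crime claim active, only the first record (Kelantan + crime reference) passes.
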
def download_dataset(dataset_id, platform="facebook", timeout=30, max_retries=3, use_cache=True, cache_ttl_hours=24):
    # Check if we have a cached dataset
    cache_file = os.path.join(CACHE_DIR, f"dataset_{dataset_id}.json")

    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cache_data = json.load(f)

            # Check if cache is still valid
            cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
            cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)

            if datetime.now() < cache_expiry:
                print(f"[💾] Using cached dataset {dataset_id} (expires {cache_expiry.isoformat()})")
                return cache_data.get('data', [])
            else:
                print(f"[⏰] Cache expired for dataset {dataset_id}, fetching fresh data")
        except Exception as e:
            print(f"[⚠️] Error reading dataset cache: {str(e)}")

    token = APIFY_TOKEN_FB if platform == "facebook" else APIFY_TOKEN_TIKTOK
    headers = {
        "Authorization": f"Bearer {token}"
    }
    dataset_url = f"https://api.apify.com/v2/datasets/{dataset_id}/items?clean=true&format=json"

    # Try multiple times in case of network issues
    for attempt in range(max_retries):
        try:
            print(f"[🔄] Attempt {attempt+1}/{max_retries} to download dataset {dataset_id}...")
            response = requests.get(dataset_url, headers=headers, timeout=timeout)

            if response.status_code != 200:
                print(f"[❌] Failed to download dataset: {response.text}")
                if attempt < max_retries - 1:
                    print("[⏳] Retrying...")
                    time.sleep(5)  # Wait 5 seconds before retrying
                    continue
                raise Exception(f"Dataset download failed after {max_retries} attempts.")

            data = response.json()
            print(f"[✓] Downloaded {len(data)} items from dataset {dataset_id}")

            # Save dataset to cache
            if use_cache:
                try:
                    cache_data = {
                        "data": data,
                        "timestamp": datetime.now().isoformat(),
                        "dataset_id": dataset_id,
                        "platform": platform
                    }

                    with open(cache_file, 'w') as f:
                        json.dump(cache_data, f)

                    print(f"[💾] Saved dataset to cache: {cache_file}")
                except Exception as e:
                    print(f"[⚠️] Error saving dataset to cache: {str(e)}")

            return data
        except requests.exceptions.Timeout:
            print(f"[❌] Request timed out after {timeout} seconds")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Dataset download timed out after {max_retries} attempts.")
        except requests.exceptions.ConnectionError:
            print(f"[❌] Connection error")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Connection error after {max_retries} attempts.")
        except Exception as e:
            print(f"[❌] Unexpected error: {str(e)}")
            if attempt < max_retries - 1:
                print("[⏳] Retrying...")
                time.sleep(5)  # Wait 5 seconds before retrying
            else:
                raise Exception(f"Unexpected error after {max_retries} attempts: {str(e)}")

    # If we get here, all retries failed
    return []

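Taken together, run_actor_task and download_dataset form the fetch pipeline used by the rest of this module. A minimal sketch of a TikTok search run follows; the import paths are assumed from this repo's layout and the payload field name is illustrative, since the exact input schema is actor-specific:

from ai_api.library.apify_scraper import build_boolean_search, run_actor_task, download_dataset
from ai_api.library.config import TIKTOK_VIDEO_TASK_ID

search_query = build_boolean_search(["cukai", "minyak sawit"])
payload = {"searchQueries": [search_query]}  # illustrative field name; check the actor's input schema

dataset_id = run_actor_task(TIKTOK_VIDEO_TASK_ID, payload, platform="tiktok")
items = download_dataset(dataset_id, platform="tiktok")
print(f"Fetched {len(items)} items from Apify dataset {dataset_id}")
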
def build_boolean_search(keywords):
    """Build an optimized search query for social media platforms"""
    search_terms = []

    for kw in keywords:
        # If keyword contains spaces (multi-word phrase), wrap in quotes
        if " " in kw:
            search_terms.append(f'"{kw}"')
        else:
            # For single words, don't use quotes to get broader results
            search_terms.append(kw)

    return " OR ".join(search_terms)

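For example, multi-word phrases are quoted while single terms stay bare:

print(build_boolean_search(["minyak sawit", "cukai", "harga barang"]))
# "minyak sawit" OR cukai OR "harga barang"
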
ai_api/library/config.py
ADDED
@@ -0,0 +1,131 @@
"""
config.py
Central configuration for the claim analysis system
"""

import os

# Base directories
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")

# Create directories if they don't exist
for directory in [DATA_DIR, OUTPUT_DIR, REPORTS_DIR]:
    os.makedirs(directory, exist_ok=True)

# API Keys
GOOGLE_API_KEY = "AIzaSyAnXTkB_0HKXKul3eI-1A56ZQWyjTVj1cQ" # Google Custom Search API key
GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Add your search engine ID here (you'll need to create this)

# Serper.dev API Key (alternative search API)
SERPER_API_KEY = "e0af440fd71fb125dd38644fe378831c3ed741ca"

# SerpApi Google Search API Key
SERPAPI_API_KEY = "007928aeb7d86d4a85af12728e3534163961837027afb63ec7b89a4624a9f4ac"

# Data source settings
USE_FACEBOOK = False # Disable Facebook data collection
USE_TIKTOK = True # Enable TikTok data collection
USE_SERPAPI = True # Enable SerpApi web search
USE_SERPER = True # Enable Serper.dev web search
USE_DUCKDUCKGO = False # Disable DuckDuckGo web search
USE_LOWYAT = True # Enable Lowyat Forum data collection

# Number of results to collect from each source
FACEBOOK_MAX_RESULTS = 100
TIKTOK_MAX_RESULTS = 10 # Significantly reduced to save Apify costs
WEB_SEARCH_MAX_RESULTS = 20
LOWYAT_MAX_THREADS = 20 # Maximum number of Lowyat Forum threads to collect

# Lowyat Forum settings
LOWYAT_SECTIONS = [
    "Kopitiam", "SeriousKopitiam", "News", "Politics", "Malaysia", "Lowyat.NET",
    "Technology", "Computers", "Notebooks", "Smartphones", "Photography", "GamingPC", "GamingConsole",
    "Automotive", "Finance", "Property", "Travel", "Food", "Health", "Sports", "Entertainment",
    "SpecialInterestGarageSales", "JobsCorner", "DigitalMarketplace"
] # All available forum sections

# Social Media API tokens
APIFY_TOKEN = "apify_api_INtF6uUT4c6nOStYDYTllxuTBNSbng1IlTTB" # Main Apify API token
APIFY_TOKEN_FB = APIFY_TOKEN # For Facebook actors
APIFY_TOKEN_TIKTOK = APIFY_TOKEN # For TikTok actors

# Actor task IDs
# From danek/facebook-search-ppr
POST_TASK_ID_SEARCH = "l5DitJrtfCyOfrjn6" # Facebook Search PPR (rajamohd/facebook-search-ppr-rm-bernama)

# From datavoyantlab/facebook-comments-scraper
COMMENT_TASK_ID = "qiAp6PQwkyYcLQiyC" # Facebook Comments Scraper (rajamohd/facebook-comments-scraper-task)

# From clockworks/free-tiktok-scraper
TIKTOK_VIDEO_TASK_ID = "rfk0BzRAjuLPbccaZ" # TikTok Data Extractor (devlab/tiktok-data-extractor-bernama2-video)

# From clockworks/tiktok-comments-scraper
TIKTOK_COMMENT_TASK_ID = "rgXeWIhnXKRD5bjGp" # TikTok Comments Scraper (devlab/tiktok-comments-scraper-bernama2)

# Apify settings
USE_COMMENTS = True # Whether to collect comments in addition to posts/videos

# Sentiment model
SENTIMENT_MODEL = "rmtariq/ft-Malay-bert"

# Priority indexer settings
PRIORITY_WEIGHTS = {
    "fact_check_value": 1.5, # Higher weight for factual importance
    "cause_confusion": 1.2, # Medium-high weight for confusion potential
    "cause_chaos": 1.8, # High weight for potential harm
    "affects_government": 1.3, # Medium-high for government impact
    "economic_impact": 1.4, # Medium-high for economic impact
    "law_related": 1.5, # Higher weight for legal implications
    "public_interest": 1.2, # Medium weight for public interest
    "lives_in_danger": 2.0, # Highest weight for safety concerns
    "viral": 1.1, # Lower weight for virality alone
    "urgent": 1.3 # Medium-high for urgency
}

PRIORITY_THRESHOLDS = {
    "high_priority": 7.0,
    "medium_priority": 5.0,
    "low_priority": 3.0
}

# Classification settings
VERDICT_CATEGORIES = {
    "TIDAK_BENAR": {
        "name": "TIDAK BENAR",
        "description": "Dakwaan ini tidak benar berdasarkan bukti yang ada.",
        "threshold": 7.0,
        "conditions": ["fact_check_value", "law_related"]
    },
    "BERCAMPUR": {
        "name": "BERCAMPUR",
        "description": "Dakwaan ini mengandungi unsur-unsur benar dan tidak benar.",
        "threshold": 5.0,
        "conditions": ["cause_confusion"]
    },
    "BENAR": {
        "name": "BENAR",
        "description": "Dakwaan ini benar berdasarkan bukti yang ada.",
        "threshold": 3.0,
        "conditions": []
    },
    "TIDAK_PASTI": {
        "name": "TIDAK PASTI",
        "description": "Tidak cukup bukti untuk menentukan kebenaran dakwaan ini.",
        "threshold": 0.0,
        "conditions": []
    }
}

# Database settings
DB_PATH = os.path.join(DATA_DIR, "claims.db")

# Malaysian filter settings
MALAYSIAN_FILTER_THRESHOLD = 0.5 # Confidence threshold for Malaysian content

# Report settings
REPORT_TEMPLATE = None # Path to DOCX template (optional)
GOOGLE_SEARCH_ENGINE_ID = "e7e6c19ee7a984f30" # Google Search Engine ID

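As a rough sketch of how the priority settings are meant to be consumed downstream (score_claim is a hypothetical helper for illustration, not part of this commit; the import path is assumed from the repo layout): a weighted score is summed from PRIORITY_WEIGHTS and mapped to a band via PRIORITY_THRESHOLDS.

from ai_api.library.config import PRIORITY_WEIGHTS, PRIORITY_THRESHOLDS

def score_claim(flags):
    # flags: dict of indicator name -> 0/1, keyed like PRIORITY_WEIGHTS (hypothetical helper)
    score = sum(PRIORITY_WEIGHTS[k] * v for k, v in flags.items() if k in PRIORITY_WEIGHTS)
    if score >= PRIORITY_THRESHOLDS["high_priority"]:
        return score, "high"
    if score >= PRIORITY_THRESHOLDS["medium_priority"]:
        return score, "medium"
    return score, "low"

print(score_claim({"lives_in_danger": 1, "cause_chaos": 1, "fact_check_value": 1, "viral": 1}))  # (6.4, 'medium')
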
ai_api/library/devlab_image.py
ADDED
@@ -0,0 +1,487 @@
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from PIL.ExifTags import TAGS
import json
import subprocess
from transformers import CLIPProcessor, CLIPModel
import torch
import requests
import base64
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import urllib.parse
import time
from deepface import DeepFace
from pymilvus import Collection, connections, CollectionSchema, FieldSchema, DataType
import numpy as np
# import faiss
import os
import pickle
import pprint
import cv2
from dotenv import load_dotenv
load_dotenv()


milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530

connections.connect("default", host=milvus_host, port=int(milvus_port))


blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


ES_HOST = "https://localhost:9200"
ES_USER = "elastic"
ES_PASS = "qR_BblnAzT-1pOQgFRvZ"
ES_INDEX = "faces"

class DevLabImage:

    def __init__(self, image_path=None):
        self.image_path = image_path

    def sanitize_name(self, title, replace='_'):
        import re
        title = re.sub(r'\s+', ' ', title).strip()
        return re.sub(r'[\\/*?:"<>|]', replace, title)

    def extract_text(self, image_path):
        import easyocr
        reader = easyocr.Reader(["en", "ms"]) # English & Malay
        text = reader.readtext(image_path, detail=0)
        return " ".join(text)

    def extract_text_numpy(self, np_array):
        import easyocr
        reader = easyocr.Reader(["en", "ms"]) # English & Malay
        text = reader.readtext(np_array, detail=0)
        return text

    # def get_emotions(self):
    #     from deepface import DeepFace
    #     return DeepFace.analyze(self.image_path, actions=['emotion'])

    def extract_exif(self, image_path):
        """Extract EXIF metadata from an image"""

        image = Image.open(image_path)
        exif_data = image._getexif()

        metadata = {}
        if exif_data:
            for tag, value in exif_data.items():
                tag_name = TAGS.get(tag, tag)
                metadata[tag_name] = value

        return metadata

    def extract_metadata_exiftool(self, image_path):
        """Extract IPTC, XMP, and EXIF metadata using ExifTool"""

        command = ["exiftool", "-j", image_path]
        result = subprocess.run(command, capture_output=True, text=True)
        metadata = json.loads(result.stdout)[0] if result.stdout else {}

        return metadata

    def generate_description_blip(self, image_path):
        """Generate an image description using BLIP"""

        image = Image.open(image_path).convert("RGB")
        inputs = blip_processor(image, return_tensors="pt")
        out = blip_model.generate(**inputs)
        return blip_processor.decode(out[0], skip_special_tokens=True)

    def extract_image_features(self, image_path):
        """Extract image embeddings using CLIP"""

        image = Image.open(image_path)
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        return features.squeeze().numpy()

    # def download_google(self,arguments):
    #     """Download from Google"""
    #     response = google_images_download.googleimagesdownload()
    #     response.download(arguments)

    # def download_person(self,person_name):
    #     # Define the emotions to search
    #     emotions = ["happy", "sad", "angry", "surprised"]

    #     for emotion in emotions:
    #         arguments = {
    #             "keywords": f"{person_name} {emotion} face",
    #             "limit": 10, # Download 10 images per emotion
    #             "print_urls": True,
    #             "format": "jpg",
    #             "output_directory": "people",
    #             "image_directory": self.sanitize_name(person_name, ' ') # Save into separate folders per emotion
    #         }
    #         self.download_google(arguments)

    def download_image(self, url, folder, image_name):
        """Download and save the image."""

        try:
            if url.startswith("data:image/"): # Base64 encoded image
                header, encoded_data = url.split(",", 1)
                extension = header.split(";")[0].split("/")[-1] # Extract file type (jpg, png, etc.)
                image_path = os.path.join(folder, f"{image_name}.{extension}")

                os.makedirs(folder, exist_ok=True)
                with open(image_path, "wb") as file:
                    file.write(base64.b64decode(encoded_data))

                print(f"✅ Base64 image saved: {image_path}")

            else: # URL download
                response = requests.get(url, stream=True, timeout=10)
                if response.status_code == 200:
                    os.makedirs(folder, exist_ok=True)
                    image_path = os.path.join(folder, f"{image_name}.jpg")
                    with open(image_path, "wb") as file:
                        for chunk in response.iter_content(1024):
                            file.write(chunk)
                    print(f"✅ Downloaded: {image_path}")
                else:
                    print(f"❌ Failed to download: {url}")
        except Exception as e:
            print(f"⚠ Error downloading {url}: {e}")

    def has_min_img_size(self, tag, min_size=100):
        img = tag.find("img")
        if img and img.has_attr("width") and img.has_attr("height"):
            try:
                width = int(img["width"])
                height = int(img["height"])
                return width >= min_size and height >= min_size
            except ValueError:
                return False
        return False

    def search_google_images(self, query, num_images=10):

        # Set up Chrome WebDriver
        options = Options()
        options.binary_location = "/usr/bin/chromium" # important for Docker
        options.add_argument("--headless") # Run in background
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920x1080")

        # Create driver using installed chromedriver
        driver = webdriver.Chrome(
            service=Service("/usr/bin/chromedriver"), # use system-installed path
            options=options
        )

        """Search Google Images and extract image URLs."""
        encoded_query = urllib.parse.quote(query)
        search_url = f"https://www.google.com/search?q={encoded_query}&tbm=isch&sclient=img"

        print(f"🔍 Searching for: {query}")

        driver.get(search_url)
        time.sleep(2) # Wait for page to load

        list_items = driver.find_elements(By.CSS_SELECTOR, "div[role='listitem']")
        list_items[1].click()
        time.sleep(3) # Wait for page to load

        # Scroll to load more images
        for _ in range(3):
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(2)

        # Extract image URLs
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # target_div = soup.find("div", {"id":query})

        # # Extract all <img> tags inside the div
        # if target_div:
        #     images = target_div.find_all("img")
        #     # images = soup.find_all("img")
        # else:
        #     images = soup.select("g-img img")
        # g_imgs = [g for g in soup.find_all("g-img") if g.get("style") not in ("width:12px;height:12px", "width:46px;height:46px")]
        g_imgs = [g for g in soup.find_all("g-img") if self.has_min_img_size(g)]

        # g_imgs = soup.select("g-img")

        # print(g_imgs)
        # driver.quit()
        # return

        image_urls = []
        for gimg in g_imgs:
            if len(image_urls) >= num_images:
                break
            img = gimg.find('img')
            src = img.get("src")

            if src.startswith("data:image/"):
                mime_type = src.split(";")[0].split(":")[1] # Extract MIME type
                file_extension = mime_type.split("/")[-1] # Extract file extension
            else:
                file_extension = src.split(".")[-1].split("?")[0].lower() # Extract file extension from URL

            # Skip GIFs
            if file_extension == "gif":
                continue
            # if not src or not src.startswith("data:image/"):
            #     continue

            # mime_type = src.split(";")[0].split(":")[1]
            # file_extension = mime_type.split("/")[-1]
            # if file_extension == "gif":
            #     continue

            image_urls.append(src)

        print(f"✅ Found {len(image_urls)} images for {query}")
        driver.quit()
        return image_urls

    def download_person_images(self, person_name, tags=None):
        """Download images for a person with different emotions."""
        emotions = ["happy", "sad", "angry", "surprised"]
        foldername = self.sanitize_name(person_name, ' ')
        # filename = self.sanitize_name(person_name)
        # for emotion in emotions:
        #     folder = f"people/{foldername}"
        #     image_urls = self.search_google_images(person_name, emotion)

        #     for i, url in enumerate(image_urls):
        #         self.download_image(url, folder, f"{emotion}{i+1}")

        folder = f"people/{foldername}"
        # query = f"{person_name} headshot OR close-up HD -group -friends -couple -family -crowd -far -selfie {tags}"
        # query = f"'{person_name}' headshot OR close-up HD medium size {tags}"
        # query = f"'{person_name}' official portrait large size"
        query = f"'{person_name}' portrait {tags}"

        image_urls = self.search_google_images(query, 5)
        for i, url in enumerate(image_urls):
            self.download_image(url, folder, f"{i+1}")

        return foldername

    def extract_face(self, person, tags):

        try:
            collection = Collection("faces")
            collection.load() # Try loading the collection to check if it exists
            print("Collection 'faces' already exists.")
        except Exception as e:
            # If collection doesn't exist, create it
            print(f"Creating collection: {e}")
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
                FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
            ]
            schema = CollectionSchema(fields, description="Face embeddings")
            collection = Collection(name="faces", schema=schema)
            collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
            collection.load()

        dataset_path = "people/"
        person_path = os.path.join(dataset_path, person)
        print(person_path)

        if not os.path.isdir(person_path):
            return

        image_files = [f for f in os.listdir(person_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

        for img in image_files:
            img_path = os.path.join(person_path, img)
            try:
                embedding = self.extract_embedding(image_path=img_path)
                if embedding is not None:
                    emb = np.array(embedding, dtype=np.float32)
                    if emb.size > 0:
                        collection.insert([[emb], [person], [tags], ['']])
                        print(f"{person} registered")
                    else:
                        print(f"No embedding found for {img_path}")

            except Exception as e:
                print(f"Could not process {img_path}: {str(e)}")

    def register_person(self, person_name, tags=''):
        """Register a person with their images."""
        folder = self.download_person_images(person_name, tags)
        self.extract_face(folder, tags)

    def query_embedding(self, query_embedding, top_k=5):

        # Load the collection
        try:
            collection = Collection("faces")
            collection.load() # Try loading the collection to check if it exists
            print("Collection 'faces' already exists.")
        except Exception as e:
            # If collection doesn't exist, create it
            print(f"Creating collection: {e}")
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
                FieldSchema(name="name", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="short_description", dtype=DataType.VARCHAR, max_length=255),
                FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=5000),
            ]
            schema = CollectionSchema(fields, description="Face embeddings")
            collection = Collection(name="faces", schema=schema)
            collection.create_index(field_name="embedding", index_params={"metric_type": "COSINE", "index_type": "HNSW", "params": {"M": 32, "efConstruction": 512}})
            collection.load()

        # query_embedding = self.extract_embedding(query_image_path)
        # if query_embedding is None:
        #     print("No embedding extracted for the query image.")
        #     return None

        # Convert the query embedding to a numpy array
        query_emb = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
        params = {"metric_type": "COSINE", "params": {"efTopK": top_k}}

        search_results = collection.search(query_emb, "embedding", output_fields=["id", "name", "short_description", "description"], param=params, limit=top_k)

        return search_results

    def extract_embedding(self, image_path):
        try:
            faces = DeepFace.represent(image_path, model_name="Facenet", enforce_detection=False)

            if faces:
                return faces[0]["embedding"]
            else:
                return None

        except Exception as e:
            print(f"Failed on {image_path}: {e}")
            return None

    def detect_faces(self):

        image = cv2.imread(self.image_path)

        face_embeddings = DeepFace.represent(self.image_path, model_name="Facenet", enforce_detection=False)

        if not face_embeddings: # No faces detected
            return "❌ No faces detected in the image."

        recognized_faces = {}

        for face_data in face_embeddings:
            # print(face_data)
            face_embedding = np.array(face_data["embedding"]).tolist()

            face_location = face_data["facial_area"]
            # face_location = face_data["region"]

            x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
            clipped_face = image[y:y+h, x:x+w]

            # The search query using cosine similarity
            query = {
                "size": 1,
                "query": {
                    "script_score": {
                        "query": {"match_all": {}}, # Match all documents
                        "script": {
                            "source": "(cosineSimilarity(params.query_vector, 'embedding') + 1) / 2", # Cosine similarity formula
                            "params": {
                                "query_vector": face_embedding # The face embedding you want to compare
                            }
                        }
                    }
                }
            }

            # Perform the POST request to Elasticsearch
            response = requests.post(
                f"{ES_HOST}/{ES_INDEX}/_search",
                headers={"Content-Type": "application/json"},
                auth=(ES_USER, ES_PASS),
                json=query,
                verify=False # Disable SSL verification for testing (in production, use SSL)
            )

            # Check if the request was successful
            if response.status_code == 200:
                # return response.json()
                results = response.json()
                # pprint.pprint(results)
                if results['hits']['hits']:
                    name = results['hits']['hits'][0]['_source']['name']
                    recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {"name": name, "image": clipped_face, "score": results['hits']['hits'][0]['_score']}

        return recognized_faces

    def delete_person(self, person):
        import requests
        import json

        delete_query = {
            "query": {
                "term": {
                    "name": person # Field to match and its value
                }
            }
        }

        # Send the DELETE request to Elasticsearch
        response = requests.post(
            f"{ES_HOST}/{ES_INDEX}/_delete_by_query",
            auth=(ES_USER, ES_PASS),
            headers={"Content-Type": "application/json"},
            data=json.dumps(delete_query),
            verify=False # Disable SSL verification for testing (use True in production)
        )

        # Check if the request was successful
        if response.status_code == 200:
            print(f"Documents with name = {person} deleted successfully.")

    def analyze(self):
        analysis = DeepFace.analyze(self.image_path, actions=['age', 'gender', 'race', 'emotion'])
        return analysis[0]

    def reverse_search(self, image_path):
        from reverse_image_search import reverse_image_search

        return reverse_image_search(image_path, engines=["google", "yandex"])

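A short usage sketch of the DevLabImage class above, assuming a reachable Milvus instance and an image already on disk (the file path and import route are illustrative):

from ai_api.library.devlab_image import DevLabImage

img = DevLabImage("uploads/sample.jpg")
print(img.generate_description_blip("uploads/sample.jpg"))  # BLIP caption
print(img.extract_text("uploads/sample.jpg"))               # OCR text (English & Malay)

emb = img.extract_embedding("uploads/sample.jpg")           # Facenet embedding, or None if no face found
if emb is not None:
    results = img.query_embedding(emb, top_k=3)             # cosine search in the Milvus "faces" collection
    for hit in results[0]:
        print(hit.entity.get("name"), hit.distance)
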
ai_api/library/lowyat_crawler.py
ADDED
@@ -0,0 +1,714 @@
| 1 |
+
# lowyat_crawler.py
|
| 2 |
+
# Crawler for Lowyat Forum data
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import time
|
| 8 |
+
import random
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import hashlib
|
| 12 |
+
from datetime import datetime, timedelta
|
| 13 |
+
import re
|
| 14 |
+
|
| 15 |
+
# Create cache directory
|
| 16 |
+
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
|
| 17 |
+
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 18 |
+
|
| 19 |
+
# Lowyat Forum base URL
|
| 20 |
+
LOWYAT_BASE_URL = "https://forum.lowyat.net"
|
| 21 |
+
|
| 22 |
+
# Forum section IDs
|
| 23 |
+
FORUM_SECTIONS = {
|
| 24 |
+
# Main Discussion Forums
|
| 25 |
+
"Kopitiam": "16", # General discussion
|
| 26 |
+
"SeriousKopitiam": "506", # Serious discussions
|
| 27 |
+
"News": "17", # News discussions
|
| 28 |
+
"Politics": "507", # Political discussions
|
| 29 |
+
"Malaysia": "508", # Malaysia-specific topics
|
| 30 |
+
"Lowyat.NET": "18", # Lowyat.NET related discussions
|
| 31 |
+
|
| 32 |
+
# Technology Forums
|
| 33 |
+
"Technology": "19", # Technology discussions
|
| 34 |
+
"Computers": "20", # Computer discussions
|
| 35 |
+
"Notebooks": "32", # Laptop discussions
|
| 36 |
+
"Smartphones": "22", # Smartphone discussions
|
| 37 |
+
"Photography": "29", # Photography discussions
|
| 38 |
+
"GamingPC": "503", # PC Gaming
|
| 39 |
+
"GamingConsole": "504", # Console Gaming
|
| 40 |
+
|
| 41 |
+
# Lifestyle Forums
|
| 42 |
+
"Automotive": "23", # Car and motorcycle discussions
|
| 43 |
+
"Finance": "24", # Financial discussions
|
| 44 |
+
"Property": "25", # Property discussions
|
| 45 |
+
"Travel": "26", # Travel discussions
|
| 46 |
+
"Food": "27", # Food discussions
|
| 47 |
+
"Health": "28", # Health discussions
|
| 48 |
+
"Sports": "30", # Sports discussions
|
| 49 |
+
"Entertainment": "31", # Entertainment discussions
|
| 50 |
+
|
| 51 |
+
# Marketplace Forums
|
| 52 |
+
"SpecialInterestGarageSales": "21", # Buy and sell
|
| 53 |
+
"JobsCorner": "33", # Job listings
|
| 54 |
+
"DigitalMarketplace": "34" # Digital marketplace
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
def get_forum_section_url(section_name):
|
| 58 |
+
"""Get the URL for a forum section"""
|
| 59 |
+
if section_name in FORUM_SECTIONS:
|
| 60 |
+
section_id = FORUM_SECTIONS[section_name]
|
| 61 |
+
return f"{LOWYAT_BASE_URL}/forums/{section_id}"
|
| 62 |
+
else:
|
| 63 |
+
# Assume it's a custom section name, try to search for it
|
| 64 |
+
return f"{LOWYAT_BASE_URL}/search/forums?q={section_name}"
|
| 65 |
+
|
| 66 |
+
def clean_text(text):
|
| 67 |
+
"""Clean text by removing extra whitespace"""
|
| 68 |
+
if not text:
|
| 69 |
+
return ""
|
| 70 |
+
return re.sub(r'\s+', ' ', text).strip()
|
| 71 |
+
|
| 72 |
+
def extract_date(date_str):
|
| 73 |
+
"""Extract and standardize date from Lowyat Forum date string"""
|
| 74 |
+
try:
|
| 75 |
+
# Handle various date formats
|
| 76 |
+
if "Today" in date_str or "Yesterday" in date_str:
|
| 77 |
+
# For relative dates, convert to actual date
|
| 78 |
+
today = datetime.now().date()
|
| 79 |
+
if "Yesterday" in date_str:
|
| 80 |
+
date = today - timedelta(days=1)
|
| 81 |
+
else:
|
| 82 |
+
date = today
|
| 83 |
+
|
| 84 |
+
# Extract time if available
|
| 85 |
+
time_match = re.search(r'(\d+:\d+\s*[AP]M)', date_str)
|
| 86 |
+
if time_match:
|
| 87 |
+
time_str = time_match.group(1)
|
| 88 |
+
return f"{date.isoformat()} {time_str}"
|
| 89 |
+
return date.isoformat()
|
| 90 |
+
else:
|
| 91 |
+
# Try to parse standard date formats
|
| 92 |
+
date_patterns = [
|
| 93 |
+
r'(\d{1,2}-\d{1,2}-\d{4})', # DD-MM-YYYY
|
| 94 |
+
r'(\d{1,2}/\d{1,2}/\d{4})', # DD/MM/YYYY
|
| 95 |
+
r'(\w+ \d{1,2}, \d{4})' # Month DD, YYYY
|
| 96 |
+
]
|
| 97 |
+
|
| 98 |
+
for pattern in date_patterns:
|
| 99 |
+
match = re.search(pattern, date_str)
|
| 100 |
+
if match:
|
| 101 |
+
return match.group(1)
|
| 102 |
+
|
| 103 |
+
# If no pattern matches, return the original string
|
| 104 |
+
return date_str
|
| 105 |
+
except Exception as e:
|
| 106 |
+
print(f"Error parsing date '{date_str}': {str(e)}")
|
| 107 |
+
return date_str
|
| 108 |
+
|
| 109 |
+
def search_lowyat_forum(keywords, sections=None, max_pages=3, max_threads=20, use_cache=True, cache_ttl_hours=24, verbose=True, use_mock_data=True):
|
| 110 |
+
"""
|
| 111 |
+
Search Lowyat Forum for threads matching keywords
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
keywords (list): List of keywords to search for
|
| 115 |
+
sections (list): List of forum sections to search in (default: ["Kopitiam", "SeriousKopitiam", "Finance"])
|
| 116 |
+
max_pages (int): Maximum number of search result pages to process
|
| 117 |
+
max_threads (int): Maximum number of threads to process
|
| 118 |
+
use_cache (bool): Whether to use cached results
|
| 119 |
+
cache_ttl_hours (int): How long to keep cached results valid
|
| 120 |
+
verbose (bool): Whether to print verbose output
|
| 121 |
+
use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
list: List of thread data dictionaries
|
| 125 |
+
"""
|
| 126 |
+
if sections is None:
|
| 127 |
+
sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 128 |
+
|
| 129 |
+
# Generate cache key
|
| 130 |
+
cache_key = f"lowyat_{'_'.join(keywords)}_{'_'.join(sections)}_{max_pages}_{max_threads}"
|
| 131 |
+
cache_hash = hashlib.md5(cache_key.encode()).hexdigest()
|
| 132 |
+
cache_file = os.path.join(CACHE_DIR, f"lowyat_{cache_hash}.json")
|
| 133 |
+
|
| 134 |
+
# Check cache
|
| 135 |
+
if use_cache and os.path.exists(cache_file):
|
| 136 |
+
try:
|
| 137 |
+
with open(cache_file, 'r') as f:
|
| 138 |
+
cache_data = json.load(f)
|
| 139 |
+
|
| 140 |
+
# Check if cache is still valid
|
| 141 |
+
cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
|
| 142 |
+
cache_expiry = cache_time + timedelta(hours=cache_ttl_hours)
|
| 143 |
+
|
| 144 |
+
if datetime.now() < cache_expiry:
|
| 145 |
+
print(f"[💾] Using cached Lowyat Forum results (expires {cache_expiry.isoformat()})")
|
| 146 |
+
return cache_data.get('threads', [])
|
| 147 |
+
else:
|
| 148 |
+
print(f"[⏰] Cache expired for Lowyat Forum search, fetching fresh data")
|
| 149 |
+
except Exception as e:
|
| 150 |
+
print(f"[⚠️] Error reading Lowyat Forum cache: {str(e)}")
|
| 151 |
+
|
| 152 |
+
all_threads = []
|
| 153 |
+
threads_processed = 0
|
| 154 |
+
cloudflare_detected = False
|
| 155 |
+
|
| 156 |
+
# Process each section
|
| 157 |
+
for section in sections:
|
| 158 |
+
if threads_processed >= max_threads:
|
| 159 |
+
break
|
| 160 |
+
|
| 161 |
+
print(f"[🔍] Searching Lowyat Forum section: {section}")
|
| 162 |
+
section_url = get_forum_section_url(section)
|
| 163 |
+
|
| 164 |
+
# For each keyword, search the section
|
| 165 |
+
for keyword in keywords:
|
| 166 |
+
if threads_processed >= max_threads:
|
| 167 |
+
break
|
| 168 |
+
|
| 169 |
+
print(f"[🔍] Searching for keyword: {keyword}")
|
| 170 |
+
|
| 171 |
+
# Construct search URL
|
| 172 |
+
if "search" in section_url:
|
| 173 |
+
# Already a search URL, add the keyword
|
| 174 |
+
search_url = f"{section_url}+{keyword.replace(' ', '+')}"
|
| 175 |
+
else:
|
| 176 |
+
# Regular section URL, add search parameter
|
| 177 |
+
search_url = f"{section_url}/search?q={keyword.replace(' ', '+')}"
|
| 178 |
+
|
| 179 |
+
# Process search result pages
|
| 180 |
+
for page in range(1, max_pages + 1):
|
| 181 |
+
if threads_processed >= max_threads:
|
| 182 |
+
break
|
| 183 |
+
|
| 184 |
+
page_url = f"{search_url}&page={page}" if page > 1 else search_url
|
| 185 |
+
print(f"[🔍] Processing page {page}: {page_url}")
|
| 186 |
+
|
| 187 |
+
try:
|
| 188 |
+
# Add random delay to avoid rate limiting
|
| 189 |
+
time.sleep(random.uniform(1, 3))
|
| 190 |
+
|
| 191 |
+
# Get search results page with enhanced headers
|
| 192 |
+
headers = {
|
| 193 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 194 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 195 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 196 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 197 |
+
'Connection': 'keep-alive',
|
| 198 |
+
'Upgrade-Insecure-Requests': '1',
|
| 199 |
+
'Cache-Control': 'max-age=0'
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
response = requests.get(page_url, headers=headers, timeout=10)
|
| 203 |
+
|
| 204 |
+
if response.status_code != 200:
|
| 205 |
+
print(f"[❌] Failed to get search results page: {response.status_code}")
|
| 206 |
+
break
|
| 207 |
+
|
| 208 |
+
if verbose:
|
| 209 |
+
print(f"[🔍] Response received: {len(response.text)} bytes")
|
| 210 |
+
|
| 211 |
+
# Check for Cloudflare protection
|
| 212 |
+
if "Cloudflare" in response.text and "challenge" in response.text:
|
| 213 |
+
print(f"[⚠️] Cloudflare protection detected. Cannot access forum content directly.")
|
| 214 |
+
cloudflare_detected = True
|
| 215 |
+
break
|
| 216 |
+
|
| 217 |
+
# Parse search results
|
| 218 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 219 |
+
thread_elements = soup.select('.structItem--thread')
|
| 220 |
+
|
| 221 |
+
if not thread_elements:
|
| 222 |
+
print(f"[⚠️] No threads found on page {page} for keyword '{keyword}' in section '{section}'")
|
| 223 |
+
|
| 224 |
+
if verbose:
|
| 225 |
+
# Print a snippet of the response to help debug
|
| 226 |
+
print(f"[🔍] Response snippet: {response.text[:500]}...")
|
| 227 |
+
|
| 228 |
+
# Check if we're getting a search results page at all
|
| 229 |
+
search_title = soup.select_one('title')
|
| 230 |
+
if search_title:
|
| 231 |
+
print(f"[🔍] Page title: {search_title.get_text()}")
|
| 232 |
+
|
| 233 |
+
# Check if there's a message about no results
|
| 234 |
+
no_results = soup.select_one('.block-row--message')
|
| 235 |
+
if no_results:
|
| 236 |
+
print(f"[🔍] Message: {no_results.get_text()}")
|
| 237 |
+
break
|
| 238 |
+
|
| 239 |
+
# Process each thread
|
| 240 |
+
for thread_elem in thread_elements:
|
| 241 |
+
if threads_processed >= max_threads:
|
| 242 |
+
break
|
| 243 |
+
|
| 244 |
+
try:
|
| 245 |
+
# Extract thread data
|
| 246 |
+
title_elem = thread_elem.select_one('.structItem-title')
|
| 247 |
+
if not title_elem:
|
| 248 |
+
continue
|
| 249 |
+
|
| 250 |
+
title = clean_text(title_elem.get_text())
|
| 251 |
+
thread_url = LOWYAT_BASE_URL + title_elem.find('a')['href']
|
| 252 |
+
|
| 253 |
+
# Extract author
|
| 254 |
+
author_elem = thread_elem.select_one('.structItem-minor')
|
| 255 |
+
author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
|
| 256 |
+
|
| 257 |
+
# Extract date
|
| 258 |
+
date_elem = thread_elem.select_one('.structItem-startDate time')
|
| 259 |
+
date_str = date_elem.get('datetime') if date_elem else "Unknown"
|
| 260 |
+
date = extract_date(date_str)
|
| 261 |
+
|
| 262 |
+
# Extract preview text if available
|
| 263 |
+
preview_elem = thread_elem.select_one('.structItem-excerpt')
|
| 264 |
+
preview = clean_text(preview_elem.get_text()) if preview_elem else ""
|
| 265 |
+
|
| 266 |
+
# Get thread content
|
| 267 |
+
thread_data = get_thread_content(thread_url)
|
| 268 |
+
|
| 269 |
+
# Combine data
|
| 270 |
+
thread_info = {
|
| 271 |
+
"platform": "lowyat_forum",
|
| 272 |
+
"section": section,
|
| 273 |
+
"title": title,
|
| 274 |
+
"author": author,
|
| 275 |
+
"date": date,
|
| 276 |
+
"url": thread_url,
|
| 277 |
+
"preview": preview,
|
| 278 |
+
"content": thread_data.get("content", ""),
|
| 279 |
+
"replies": thread_data.get("replies", [])
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
all_threads.append(thread_info)
|
| 283 |
+
threads_processed += 1
|
| 284 |
+
print(f"[✓] Processed thread: {title} ({threads_processed}/{max_threads})")
|
| 285 |
+
|
| 286 |
+
except Exception as e:
|
| 287 |
+
print(f"[❌] Error processing thread: {str(e)}")
|
| 288 |
+
|
| 289 |
+
# Check if there are more pages
|
| 290 |
+
next_page = soup.select_one('.pageNav-jump--next')
|
| 291 |
+
if not next_page:
|
| 292 |
+
print(f"[⚠️] No more pages for keyword '{keyword}' in section '{section}'")
|
| 293 |
+
break
|
| 294 |
+
|
| 295 |
+
except Exception as e:
|
| 296 |
+
print(f"[❌] Error processing page {page}: {str(e)}")
|
| 297 |
+
break
|
| 298 |
+
|
| 299 |
+
# If no threads found and Cloudflare detected, use mock data if enabled
|
| 300 |
+
if not all_threads and cloudflare_detected and use_mock_data:
|
| 301 |
+
print(f"[ℹ️] Using mock data for Lowyat Forum due to Cloudflare protection")
|
| 302 |
+
all_threads = generate_mock_lowyat_data(keywords, sections, max_threads)
|
| 303 |
+
|
| 304 |
+
# Save results to cache
|
| 305 |
+
if use_cache:
|
| 306 |
+
try:
|
| 307 |
+
cache_data = {
|
| 308 |
+
"threads": all_threads,
|
| 309 |
+
"timestamp": datetime.now().isoformat(),
|
| 310 |
+
"keywords": keywords,
|
| 311 |
+
"sections": sections
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
with open(cache_file, 'w') as f:
|
| 315 |
+
json.dump(cache_data, f)
|
| 316 |
+
|
| 317 |
+
print(f"[💾] Saved Lowyat Forum results to cache: {cache_file}")
|
| 318 |
+
except Exception as e:
|
| 319 |
+
print(f"[⚠️] Error saving Lowyat Forum results to cache: {str(e)}")
|
| 320 |
+
|
| 321 |
+
return all_threads
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def generate_mock_lowyat_data(keywords, sections, max_threads):
|
| 325 |
+
"""
|
| 326 |
+
Generate mock data for Lowyat Forum when real data cannot be retrieved
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
keywords (list): List of keywords used for the search
|
| 330 |
+
sections (list): List of forum sections that were searched
|
| 331 |
+
max_threads (int): Maximum number of threads to generate
|
| 332 |
+
|
| 333 |
+
Returns:
|
| 334 |
+
list: List of mock thread data dictionaries
|
| 335 |
+
"""
|
| 336 |
+
print(f"[💻] Generating mock data for keywords: {', '.join(keywords)}")
|
| 337 |
+
|
| 338 |
+
# Create a list to store mock threads
|
| 339 |
+
mock_threads = []
|
| 340 |
+
|
| 341 |
+
# Define some common Malaysian usernames
|
| 342 |
+
usernames = [
|
| 343 |
+
"MalaysianGuy", "KLite", "JohorianPride", "PenangFoodie", "SarawakExplorer",
|
| 344 |
+
"MalaccaHistory", "SabahAdventure", "IPohBoy", "KuchingCat", "TerengganuDiver",
|
| 345 |
+
"PerakMan", "KedahPadi", "NegeriS9", "PahangForest", "MelakaCendol"
|
| 346 |
+
]
|
| 347 |
+
|
| 348 |
+
# Define some common topics based on keywords
|
| 349 |
+
topics_by_keyword = {
|
| 350 |
+
"cukai": [
|
| 351 |
+
"Cukai baharu akan diperkenalkan tahun depan?",
|
| 352 |
+
"Pendapat tentang cukai keuntungan modal",
|
| 353 |
+
"Cara menjimatkan cukai pendapatan",
|
| 354 |
+
"Cukai jualan dan perkhidmatan (SST) vs GST",
|
| 355 |
+
"Adakah cukai kereta import akan dikurangkan?"
|
| 356 |
+
],
|
| 357 |
+
"minyak sawit": [
|
| 358 |
+
"Harga minyak sawit dijangka naik bulan depan",
|
| 359 |
+
"EU ban minyak sawit: Kesan kepada Malaysia",
|
| 360 |
+
"Industri minyak sawit dan isu kelestarian",
|
| 361 |
+
"Minyak sawit vs minyak zaitun: Mana lebih sihat?",
|
| 362 |
+
"Eksport minyak sawit Malaysia meningkat 15%"
|
| 363 |
+
],
|
| 364 |
+
"kerajaan": [
|
| 365 |
+
"Kerajaan akan umum inisiatif baharu untuk sektor perumahan",
|
| 366 |
+
"Polisi kerajaan untuk industri teknologi",
|
| 367 |
+
"Kerajaan perkenal subsidi baharu untuk petani",
|
| 368 |
+
"Pandangan tentang prestasi kerajaan semasa",
|
| 369 |
+
"Kerajaan lancar program bantuan PKS"
|
| 370 |
+
],
|
| 371 |
+
"ekonomi": [
|
| 372 |
+
"Ekonomi Malaysia dijangka pulih pada Q3",
|
| 373 |
+
"Kesan inflasi kepada ekonomi tempatan",
|
| 374 |
+
"Ringgit vs USD: Analisis semasa",
|
| 375 |
+
"Sektor pelancongan menyumbang kepada pemulihan ekonomi",
|
| 376 |
+
"Bagaimana keadaan ekonomi mempengaruhi pasaran hartanah?"
|
| 377 |
+
]
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
# Default topics if no matching keywords
|
| 381 |
+
default_topics = [
|
| 382 |
+
"Pandangan tentang isu semasa di Malaysia",
|
| 383 |
+
"Perbincangan tentang kenaikan harga barang",
|
| 384 |
+
"Cadangan tempat makan sedap di KL",
|
| 385 |
+
"Perkongsian pengalaman kerja dari rumah",
|
| 386 |
+
"Tips melabur dalam pasaran saham Malaysia"
|
| 387 |
+
]
|
| 388 |
+
|
| 389 |
+
# Generate threads for each section
|
| 390 |
+
threads_per_section = max(1, max_threads // len(sections))
|
| 391 |
+
|
| 392 |
+
for section in sections:
|
| 393 |
+
# Find relevant topics based on keywords
|
| 394 |
+
relevant_topics = []
|
| 395 |
+
for keyword in keywords:
|
| 396 |
+
keyword_lower = keyword.lower()
|
| 397 |
+
# Check if we have predefined topics for this keyword
|
| 398 |
+
for k, topics in topics_by_keyword.items():
|
| 399 |
+
if k in keyword_lower or keyword_lower in k:
|
| 400 |
+
relevant_topics.extend(topics)
|
| 401 |
+
|
| 402 |
+
# If no relevant topics found, use default topics
|
| 403 |
+
if not relevant_topics:
|
| 404 |
+
relevant_topics = default_topics
|
| 405 |
+
|
| 406 |
+
# Generate threads for this section
|
| 407 |
+
for i in range(threads_per_section):
|
| 408 |
+
if len(mock_threads) >= max_threads:
|
| 409 |
+
break
|
| 410 |
+
|
| 411 |
+
# Select a topic
|
| 412 |
+
topic = random.choice(relevant_topics)
|
| 413 |
+
|
| 414 |
+
# Generate a date within the last month
|
| 415 |
+
days_ago = random.randint(1, 30)
|
| 416 |
+
thread_date = (datetime.now() - timedelta(days=days_ago)).isoformat()
|
| 417 |
+
|
| 418 |
+
# Generate content
|
| 419 |
+
content = f"Ini adalah perbincangan tentang {topic}. "
|
| 420 |
+
content += f"Saya ingin berkongsi pendapat dan mendapatkan maklum balas daripada ahli forum. "
|
| 421 |
+
content += f"Apakah pandangan anda tentang perkara ini?"
|
| 422 |
+
|
| 423 |
+
# Generate replies
|
| 424 |
+
num_replies = random.randint(1, 5)
|
| 425 |
+
replies = []
|
| 426 |
+
|
| 427 |
+
for j in range(num_replies):
|
| 428 |
+
reply_days_ago = random.randint(0, days_ago)
|
| 429 |
+
reply_date = (datetime.now() - timedelta(days=reply_days_ago)).isoformat()
|
| 430 |
+
|
| 431 |
+
reply_username = random.choice(usernames)
|
| 432 |
+
reply_content = f"Saya bersetuju dengan pendapat anda tentang {topic}. "
|
| 433 |
+
reply_content += f"Ini adalah pandangan saya..."
|
| 434 |
+
|
| 435 |
+
replies.append({
|
| 436 |
+
"author": reply_username,
|
| 437 |
+
"date": reply_date,
|
| 438 |
+
"content": reply_content
|
| 439 |
+
})
|
| 440 |
+
|
| 441 |
+
# Create thread info
|
| 442 |
+
thread_info = {
|
| 443 |
+
"platform": "lowyat_forum",
|
| 444 |
+
"section": section,
|
| 445 |
+
"title": topic,
|
| 446 |
+
"author": random.choice(usernames),
|
| 447 |
+
"date": thread_date,
|
| 448 |
+
"url": f"https://forum.lowyat.net/topic/{random.randint(100000, 999999)}",
|
| 449 |
+
"preview": content[:100] + "...",
|
| 450 |
+
"content": content,
|
| 451 |
+
"replies": replies
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
mock_threads.append(thread_info)
|
| 455 |
+
print(f"[💻] Generated mock thread: {topic} in {section}")
|
| 456 |
+
|
| 457 |
+
return mock_threads
|
| 458 |
+
|
| 459 |
+
def get_thread_content(thread_url, max_posts=10):
|
| 460 |
+
"""
|
| 461 |
+
Get content from a Lowyat Forum thread
|
| 462 |
+
|
| 463 |
+
Args:
|
| 464 |
+
thread_url (str): URL of the thread
|
| 465 |
+
max_posts (int): Maximum number of posts to extract
|
| 466 |
+
|
| 467 |
+
Returns:
|
| 468 |
+
dict: Thread content and replies
|
| 469 |
+
"""
|
| 470 |
+
try:
|
| 471 |
+
# Add random delay to avoid rate limiting
|
| 472 |
+
time.sleep(random.uniform(1, 3))
|
| 473 |
+
|
| 474 |
+
# Get thread page
|
| 475 |
+
response = requests.get(thread_url, headers={
|
| 476 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 477 |
+
})
|
| 478 |
+
|
| 479 |
+
if response.status_code != 200:
|
| 480 |
+
print(f"[❌] Failed to get thread page: {response.status_code}")
|
| 481 |
+
return {"content": "", "replies": []}
|
| 482 |
+
|
| 483 |
+
# Parse thread page
|
| 484 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 485 |
+
|
| 486 |
+
# Get main post content
|
| 487 |
+
main_post = soup.select_one('.message--post')
|
| 488 |
+
content = ""
|
| 489 |
+
if main_post:
|
| 490 |
+
content_elem = main_post.select_one('.message-body .bbWrapper')
|
| 491 |
+
content = clean_text(content_elem.get_text()) if content_elem else ""
|
| 492 |
+
|
| 493 |
+
# Get replies
|
| 494 |
+
replies = []
|
| 495 |
+
reply_elements = soup.select('.message--post')[1:max_posts+1] # Skip the first post (main content)
|
| 496 |
+
|
| 497 |
+
for reply_elem in reply_elements:
|
| 498 |
+
try:
|
| 499 |
+
# Extract reply author
|
| 500 |
+
author_elem = reply_elem.select_one('.message-name')
|
| 501 |
+
author = clean_text(author_elem.get_text()) if author_elem else "Unknown"
|
| 502 |
+
|
| 503 |
+
# Extract reply date
|
| 504 |
+
date_elem = reply_elem.select_one('.message-attribution-main time')
|
| 505 |
+
date_str = date_elem.get('datetime') if date_elem else "Unknown"
|
| 506 |
+
date = extract_date(date_str)
|
| 507 |
+
|
| 508 |
+
# Extract reply content
|
| 509 |
+
content_elem = reply_elem.select_one('.message-body .bbWrapper')
|
| 510 |
+
reply_content = clean_text(content_elem.get_text()) if content_elem else ""
|
| 511 |
+
|
| 512 |
+
replies.append({
|
| 513 |
+
"author": author,
|
| 514 |
+
"date": date,
|
| 515 |
+
"content": reply_content
|
| 516 |
+
})
|
| 517 |
+
except Exception as e:
|
| 518 |
+
print(f"[❌] Error processing reply: {str(e)}")
|
| 519 |
+
|
| 520 |
+
return {
|
| 521 |
+
"content": content,
|
| 522 |
+
"replies": replies
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
except Exception as e:
|
| 526 |
+
print(f"[❌] Error getting thread content: {str(e)}")
|
| 527 |
+
return {"content": "", "replies": []}
|
| 528 |
+
|
| 529 |
+
def convert_to_dataframe(threads):
|
| 530 |
+
"""
|
| 531 |
+
Convert Lowyat Forum thread data to a DataFrame compatible with the claim analysis system
|
| 532 |
+
|
| 533 |
+
Args:
|
| 534 |
+
threads (list): List of thread data dictionaries
|
| 535 |
+
|
| 536 |
+
Returns:
|
| 537 |
+
pandas.DataFrame: DataFrame with standardized columns
|
| 538 |
+
"""
|
| 539 |
+
records = []
|
| 540 |
+
|
| 541 |
+
for thread in threads:
|
| 542 |
+
# Add the main thread as a record
|
| 543 |
+
main_record = {
|
| 544 |
+
"platform": "LowyatForum", # Changed to standardized label
|
| 545 |
+
"date": thread.get("date", ""),
|
| 546 |
+
"username": thread.get("author", ""),
|
| 547 |
+
"post_text": thread.get("title", "") + " " + thread.get("content", ""),
|
| 548 |
+
"post_url": thread.get("url", ""),
|
| 549 |
+
"likes": 0, # Lowyat doesn't expose like counts in the HTML
|
| 550 |
+
"shares": 0, # No share counts
|
| 551 |
+
"comments_count": len(thread.get("replies", [])),
|
| 552 |
+
"comment_text": "",
|
| 553 |
+
"combined_text": thread.get("title", "") + " " + thread.get("content", "")
|
| 554 |
+
}
|
| 555 |
+
records.append(main_record)
|
| 556 |
+
|
| 557 |
+
# Add each reply as a separate record
|
| 558 |
+
for reply in thread.get("replies", []):
|
| 559 |
+
reply_record = {
|
| 560 |
+
"platform": "LowyatForum_Comment", # Changed to standardized label
|
| 561 |
+
"date": reply.get("date", ""),
|
| 562 |
+
"username": reply.get("author", ""),
|
| 563 |
+
"post_text": "",
|
| 564 |
+
"post_url": thread.get("url", ""),
|
| 565 |
+
"likes": 0,
|
| 566 |
+
"shares": 0,
|
| 567 |
+
"comments_count": 0,
|
| 568 |
+
"comment_text": reply.get("content", ""),
|
| 569 |
+
"combined_text": reply.get("content", "")
|
| 570 |
+
}
|
| 571 |
+
records.append(reply_record)
|
| 572 |
+
|
| 573 |
+
# Create DataFrame
|
| 574 |
+
if records:
|
| 575 |
+
df = pd.DataFrame(records)
|
| 576 |
+
return df
|
| 577 |
+
else:
|
| 578 |
+
# Return empty DataFrame with correct columns
|
| 579 |
+
return pd.DataFrame(columns=[
|
| 580 |
+
"platform", "date", "username", "post_text", "post_url",
|
| 581 |
+
"likes", "shares", "comments_count", "comment_text", "combined_text"
|
| 582 |
+
])
|
| 583 |
+
|
| 584 |
+
def run(keywords, sections=None, max_threads=20, output_path=None, full_claim=None, verbose=True, use_mock_data=True):
|
| 585 |
+
"""
|
| 586 |
+
Run the Lowyat Forum crawler and save results
|
| 587 |
+
|
| 588 |
+
Args:
|
| 589 |
+
keywords (list): List of keywords to search for
|
| 590 |
+
sections (list): List of forum sections to search in
|
| 591 |
+
max_threads (int): Maximum number of threads to process
|
| 592 |
+
output_path (str): Path to save results CSV
|
| 593 |
+
full_claim (str): The full claim text for more targeted searching
|
| 594 |
+
verbose (bool): Whether to print verbose output
|
| 595 |
+
use_mock_data (bool): Whether to use mock data if real data cannot be retrieved
|
| 596 |
+
|
| 597 |
+
Returns:
|
| 598 |
+
pandas.DataFrame: DataFrame with crawled data
|
| 599 |
+
"""
|
| 600 |
+
print(f"[🔍] Starting Lowyat Forum crawler for keywords: {', '.join(keywords)}")
|
| 601 |
+
|
| 602 |
+
# Check if this is a crime-related claim about Kelantan
|
| 603 |
+
crime_related = any(kw in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"] for kw in keywords)
|
| 604 |
+
kelantan_related = any("kelantan" in kw.lower() for kw in keywords)
|
| 605 |
+
|
| 606 |
+
# Use the full claim directly if available for crime-related claims in Kelantan
|
| 607 |
+
if full_claim and crime_related and kelantan_related:
|
| 608 |
+
print(f"[🔍] Using full claim for Lowyat Forum search: {full_claim}")
|
| 609 |
+
|
| 610 |
+
# Use the full claim as a single search term
|
| 611 |
+
keywords = [full_claim]
|
| 612 |
+
|
| 613 |
+
# Also add these specialized keywords for better coverage
|
| 614 |
+
specialized_keywords = [
|
| 615 |
+
"polis kelantan",
|
| 616 |
+
"kes rogol kelantan",
|
| 617 |
+
"sumbang mahram",
|
| 618 |
+
"jenayah seksual"
|
| 619 |
+
]
|
| 620 |
+
|
| 621 |
+
# Add specialized keywords to the search
|
| 622 |
+
keywords.extend(specialized_keywords)
|
| 623 |
+
print(f"[🔍] Using keywords: {', '.join(keywords)}")
|
| 624 |
+
# Use more targeted keywords for crime-related claims in Kelantan (if no full claim)
|
| 625 |
+
elif crime_related and kelantan_related:
|
| 626 |
+
print("[🔍] Detected crime-related claim about Kelantan, using specialized keywords")
|
| 627 |
+
keywords = [
|
| 628 |
+
"polis kelantan",
|
| 629 |
+
"kes rogol kelantan",
|
| 630 |
+
"sumbang mahram",
|
| 631 |
+
"jenayah seksual"
|
| 632 |
+
]
|
| 633 |
+
# Add context-specific keywords for other types of claims
|
| 634 |
+
elif full_claim:
|
| 635 |
+
# Check for economic/financial claims
|
| 636 |
+
if any(term in full_claim.lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]):
|
| 637 |
+
print("[🔍] Detected economic/financial claim, adding relevant keywords")
|
| 638 |
+
econ_keywords = ["ekonomi malaysia", "kewangan", "cukai", "subsidi", "harga"]
|
| 639 |
+
keywords.extend([k for k in econ_keywords if k not in keywords])
|
| 640 |
+
|
| 641 |
+
# Check for political claims
|
| 642 |
+
elif any(term in full_claim.lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]):
|
| 643 |
+
print("[🔍] Detected political claim, adding relevant keywords")
|
| 644 |
+
pol_keywords = ["kerajaan", "politik malaysia", "dasar", "kabinet"]
|
| 645 |
+
keywords.extend([k for k in pol_keywords if k not in keywords])
|
| 646 |
+
|
| 647 |
+
# Set default sections if not provided
|
| 648 |
+
if sections is None:
|
| 649 |
+
sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 650 |
+
|
| 651 |
+
# Validate sections against available forum sections
|
| 652 |
+
valid_sections = [section for section in sections if section in FORUM_SECTIONS]
|
| 653 |
+
if not valid_sections:
|
| 654 |
+
print("[⚠️] No valid forum sections provided. Using default sections.")
|
| 655 |
+
valid_sections = ["Kopitiam", "SeriousKopitiam", "Finance"]
|
| 656 |
+
|
| 657 |
+
# If sections were invalid, inform the user
|
| 658 |
+
if len(valid_sections) != len(sections):
|
| 659 |
+
print(f"[⚠️] Some sections were invalid. Using: {', '.join(valid_sections)}")
|
| 660 |
+
|
| 661 |
+
# For crime-related topics, prioritize SeriousKopitiam
|
| 662 |
+
if crime_related and "SeriousKopitiam" in valid_sections:
|
| 663 |
+
# Move SeriousKopitiam to the front of the list
|
| 664 |
+
valid_sections.remove("SeriousKopitiam")
|
| 665 |
+
valid_sections.insert(0, "SeriousKopitiam")
|
| 666 |
+
|
| 667 |
+
# For economic topics, prioritize Finance
|
| 668 |
+
elif any(term in "".join(keywords).lower() for term in ["ekonomi", "kewangan", "cukai", "subsidi", "harga"]) and "Finance" in valid_sections:
|
| 669 |
+
valid_sections.remove("Finance")
|
| 670 |
+
valid_sections.insert(0, "Finance")
|
| 671 |
+
|
| 672 |
+
# For political topics, prioritize Politics
|
| 673 |
+
elif any(term in "".join(keywords).lower() for term in ["kerajaan", "politik", "perdana menteri", "kabinet", "parlimen"]) and "Politics" in valid_sections:
|
| 674 |
+
valid_sections.remove("Politics")
|
| 675 |
+
valid_sections.insert(0, "Politics")
|
| 676 |
+
|
| 677 |
+
# Search forum with enhanced options
|
| 678 |
+
threads = search_lowyat_forum(
|
| 679 |
+
keywords,
|
| 680 |
+
sections=valid_sections,
|
| 681 |
+
max_threads=max_threads,
|
| 682 |
+
verbose=verbose,
|
| 683 |
+
use_mock_data=use_mock_data
|
| 684 |
+
)
|
| 685 |
+
print(f"[✓] Found {len(threads)} threads on Lowyat Forum")
|
| 686 |
+
|
| 687 |
+
# Convert to DataFrame
|
| 688 |
+
df = convert_to_dataframe(threads)
|
| 689 |
+
print(f"[✓] Converted to {len(df)} records")
|
| 690 |
+
|
| 691 |
+
# Save to CSV if output path provided
|
| 692 |
+
if output_path and len(df) > 0:
|
| 693 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 694 |
+
df.to_csv(output_path, index=False)
|
| 695 |
+
print(f"[💾] Saved Lowyat Forum data to {output_path}")
|
| 696 |
+
elif output_path:
|
| 697 |
+
# Create an empty CSV file with the correct columns
|
| 698 |
+
empty_df = pd.DataFrame(columns=[
|
| 699 |
+
"platform", "date", "username", "post_text", "post_url",
|
| 700 |
+
"likes", "shares", "comments_count", "comment_text", "combined_text"
|
| 701 |
+
])
|
| 702 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 703 |
+
empty_df.to_csv(output_path, index=False)
|
| 704 |
+
print(f"[💾] Saved empty Lowyat Forum data file to {output_path}")
|
| 705 |
+
|
| 706 |
+
return df
|
| 707 |
+
|
| 708 |
+
# Test the crawler if run directly
|
| 709 |
+
if __name__ == "__main__":
|
| 710 |
+
test_keywords = ["cukai minyak sawit", "palm oil tax"]
|
| 711 |
+
test_sections = ["Kopitiam", "Finance"]
|
| 712 |
+
|
| 713 |
+
df = run(test_keywords, sections=test_sections, max_threads=10)
|
| 714 |
+
print(df.head())
|
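For reference, a minimal usage sketch of the crawler's run() entry point above (not part of the committed file; the import path and argument values are assumptions based on this commit's layout and defaults):

from ai_api.library import lowyat_crawler

# Crawl up to 5 threads for two Malay keywords, falling back to mock data when
# Cloudflare blocks the forum, and write the standardized CSV for later stages.
df = lowyat_crawler.run(
    ["cukai", "minyak sawit"],
    sections=["Kopitiam", "Finance"],
    max_threads=5,
    output_path="reports/lowyat_sample.csv",
    use_mock_data=True,
)
print(df[["platform", "username", "combined_text"]].head())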
ai_api/library/priority_indexer.py
ADDED
|
@@ -0,0 +1,360 @@
|
| 1 |
+
# priority_indexer.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
def load_agency_keywords(filepath=None):
|
| 9 |
+
"""
|
| 10 |
+
Load keywords for agency detection or use default keywords if file not found
|
| 11 |
+
"""
|
| 12 |
+
# Define default agency keywords if file not provided or not found
|
| 13 |
+
default_keywords = {
|
| 14 |
+
# Government-related keywords
|
| 15 |
+
"government": [
|
| 16 |
+
"kerajaan", "menteri", "perdana menteri", "kementerian", "jabatan",
|
| 17 |
+
"agensi", "dasar", "parlimen", "dewan rakyat", "dewan negara",
|
| 18 |
+
"dun", "pejabat", "keselamatan negara", "atm", "polis",
|
| 19 |
+
"kdn", "hasil", "sop", "ancaman", "pentadbiran", "kabinet",
|
| 20 |
+
"politik", "ahli parlimen", "wakil rakyat", "adun", "pemimpin",
|
| 21 |
+
"ketua menteri", "menteri besar", "exco", "majlis", "pihak berkuasa",
|
| 22 |
+
"pbt", "majlis perbandaran", "majlis bandaraya", "dewan bandaraya"
|
| 23 |
+
],
|
| 24 |
+
|
| 25 |
+
# Economic keywords
|
| 26 |
+
"economic": [
|
| 27 |
+
"ekonomi", "kewangan", "bank", "cukai", "subsidi", "harga", "kos",
|
| 28 |
+
"perbelanjaan", "pendapatan", "gaji", "dividen", "saham", "pasaran",
|
| 29 |
+
"inflasi", "deflasi", "krisis", "kemelesetan", "pertumbuhan", "gdp",
|
| 30 |
+
"kdnk", "pelaburan", "pelabur", "perniagaan", "syarikat", "industri",
|
| 31 |
+
"sektor", "perdagangan", "import", "eksport", "mata wang", "ringgit",
|
| 32 |
+
"dolar", "hutang", "pinjaman", "faedah", "untung", "rugi", "bayaran",
|
| 33 |
+
"fi", "yuran", "perbelanjaan", "pendapatan", "bonus", "elaun",
|
| 34 |
+
"insentif", "bantuan", "sumbangan", "derma", "zakat", "duti",
|
| 35 |
+
"levi", "caj", "jualan", "belian", "pembelian", "perolehan",
|
| 36 |
+
"tender", "kontrak", "projek", "pembangunan", "infrastruktur",
|
| 37 |
+
"pembinaan", "hartanah", "rumah", "kediaman", "komersial",
|
| 38 |
+
"tanah", "saiz", "keluasan", "murah", "mahal", "berpatutan",
|
| 39 |
+
"mampu", "tidak mampu", "bekalan", "stok", "inventori",
|
| 40 |
+
"simpanan", "rizab", "aset", "liabiliti", "kredit", "debit",
|
| 41 |
+
"ansuran", "keuntungan", "kerugian", "defisit", "surplus",
|
| 42 |
+
"lebihan", "kekurangan", "kenaikan", "penurunan", "peningkatan",
|
| 43 |
+
"pengurangan", "pemulihan", "pembaikan"
|
| 44 |
+
],
|
| 45 |
+
|
| 46 |
+
# Law-related keywords
|
| 47 |
+
"law": [
|
| 48 |
+
"undang-undang", "perundangan", "akta", "enakmen", "ordinan",
|
| 49 |
+
"peraturan", "perlembagaan", "mahkamah", "hakim", "peguam",
|
| 50 |
+
"pendakwa", "pendakwaan", "pertuduhan", "dakwaan", "saman",
|
| 51 |
+
"waran", "tangkap", "tahan", "reman", "jamin", "ikat jamin",
|
| 52 |
+
"denda", "hukuman", "penjara", "polis", "balai", "laporan",
|
| 53 |
+
"aduan", "siasatan", "siasat", "jenayah", "sivil", "kes",
|
| 54 |
+
"fail", "bicara", "perbicaraan", "prosiding", "rayuan",
|
| 55 |
+
"petisyen", "pindaan", "bon", "jaminan", "saksi", "keterangan",
|
| 56 |
+
"bukti", "forensik", "peguambela", "peguamcara", "pendakwa raya",
|
| 57 |
+
"majistret", "ketua hakim", "ketua hakim negara", "hakim besar",
|
| 58 |
+
"mahkamah tinggi", "mahkamah rayuan", "mahkamah persekutuan",
|
| 59 |
+
"mahkamah rendah", "mahkamah majistret", "mahkamah sesyen",
|
| 60 |
+
"mahkamah syariah", "pdrm", "ibu pejabat polis", "ketua polis",
|
| 61 |
+
"pegawai polis", "anggota polis", "konstabel", "koperal",
|
| 62 |
+
"sarjan", "inspektor", "superintendan", "komisioner", "sprm",
|
| 63 |
+
"suruhanjaya pencegahan rasuah", "rasuah", "korupsi",
|
| 64 |
+
"salah guna kuasa", "penyelewengan", "pecah amanah",
|
| 65 |
+
"pengubahan wang haram"
|
| 66 |
+
],
|
| 67 |
+
|
| 68 |
+
# Danger-related keywords
|
| 69 |
+
"danger": [
|
| 70 |
+
"bahaya", "merbahaya", "risiko", "ancaman", "bencana", "malapetaka",
|
| 71 |
+
"tragedi", "musibah", "kemalangan", "nahas", "kecelakaan", "kecederaan",
|
| 72 |
+
"kematian", "korban", "mangsa", "kemusnahan", "kerosakan", "kerugian",
|
| 73 |
+
"kehilangan", "kecurian", "rompakan", "samun", "ragut", "pecah",
|
| 74 |
+
"pecah rumah", "pecah masuk", "curi", "culik", "bunuh", "bunuh diri",
|
| 75 |
+
"mati", "cedera", "parah", "kritikal", "koma", "luka", "patah",
|
| 76 |
+
"retak", "lebam", "bengkak", "darah", "pendarahan", "kecemasan",
|
| 77 |
+
"ambulans", "hospital", "klinik", "doktor", "ubat", "dadah",
|
| 78 |
+
"narkotik", "ganja", "heroin", "kokain", "syabu", "pil kuda",
|
| 79 |
+
"ekstasi", "ketamin", "morfin", "ketagihan", "penagih", "pengedar",
|
| 80 |
+
"sindiket", "kartel", "mafia", "gangster", "kongsi gelap", "geng",
|
| 81 |
+
"kumpulan jenayah", "penjenayah", "penjahat", "pesalah", "banduan",
|
| 82 |
+
"tahanan", "suspek", "tertuduh", "terdakwa", "senjata", "pistol",
|
| 83 |
+
"revolver", "senapang", "rifle", "shotgun", "bom", "granat",
|
| 84 |
+
"peluru", "kelongsong", "senjata api", "senjata tajam", "pisau",
|
| 85 |
+
"parang", "kapak", "keris", "pedang", "racun", "toksin", "kimia",
|
| 86 |
+
"biologi", "nuklear", "radiasi", "sinaran", "letupan", "ledakan",
|
| 87 |
+
"kebakaran", "api", "nyalaan", "bara", "asap", "hangus", "terbakar",
|
| 88 |
+
"banjir", "bah", "limpahan", "hujan", "ribut", "taufan", "siklon",
|
| 89 |
+
"hurikan", "tornado", "puting beliung", "angin kencang", "kilat",
|
| 90 |
+
"petir", "guruh", "guntur", "halilintar", "tanah runtuh", "gelinciran tanah",
|
| 91 |
+
"runtuhan", "runtuh", "jatuh", "roboh", "rebah", "tumbang", "gempa",
|
| 92 |
+
"gempa bumi", "tsunami", "ombak besar", "gelombang tinggi", "kemarau",
|
| 93 |
+
"kekeringan", "perang", "pertempuran", "pergaduhan", "perkelahian",
|
| 94 |
+
"rusuhan", "kekacauan", "huru-hara", "keganasan", "kekerasan",
|
| 95 |
+
"keselamatan", "keselamatan negara", "keselamatan awam", "kanser",
|
| 96 |
+
"barah", "tumor", "penyakit", "wabak", "epidemik", "pandemik",
|
| 97 |
+
"jangkitan", "virus", "bakteria", "nyawa", "terancam", "maut"
|
| 98 |
+
]
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
# Try to load from file if provided
|
| 102 |
+
if filepath and os.path.exists(filepath):
|
| 103 |
+
try:
|
| 104 |
+
df = pd.read_csv(filepath)
|
| 105 |
+
if 'keyword' in df.columns and 'category' in df.columns:
|
| 106 |
+
# Group keywords by category
|
| 107 |
+
keywords = {}
|
| 108 |
+
for category in df['category'].unique():
|
| 109 |
+
keywords[category] = df[df['category'] == category]['keyword'].tolist()
|
| 110 |
+
return keywords
|
| 111 |
+
else:
|
| 112 |
+
print(f"[⚠️] Warning: Required columns not found in {filepath}. Using default keywords.")
|
| 113 |
+
return default_keywords
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"[⚠️] Error loading agency keywords from {filepath}: {e}")
|
| 116 |
+
return default_keywords
|
| 117 |
+
else:
|
| 118 |
+
if filepath:
|
| 119 |
+
print(f"[ℹ️] Agency keywords file not found. Using default keywords.")
|
| 120 |
+
return default_keywords
|
| 121 |
+
|
| 122 |
+
def analyze_text_content(df, keywords_dict):
|
| 123 |
+
"""
|
| 124 |
+
Analyze text content in the dataframe to find keywords
|
| 125 |
+
Returns a dictionary of found keywords by category
|
| 126 |
+
"""
|
| 127 |
+
found_keywords = {category: [] for category in keywords_dict.keys()}
|
| 128 |
+
|
| 129 |
+
# Combine all text columns
|
| 130 |
+
text_columns = ['post_text', 'comment_text', 'title', 'snippet', 'combined_text']
|
| 131 |
+
all_text = ""
|
| 132 |
+
|
| 133 |
+
for col in text_columns:
|
| 134 |
+
if col in df.columns:
|
| 135 |
+
all_text += " " + " ".join(df[col].fillna("").astype(str))
|
| 136 |
+
|
| 137 |
+
all_text = all_text.lower()
|
| 138 |
+
|
| 139 |
+
# Search for keywords in the combined text
|
| 140 |
+
for category, keywords in keywords_dict.items():
|
| 141 |
+
for keyword in keywords:
|
| 142 |
+
if keyword.lower() in all_text:
|
| 143 |
+
found_keywords[category].append(keyword)
|
| 144 |
+
|
| 145 |
+
# Remove duplicates and limit to top 5 per category
|
| 146 |
+
for category in found_keywords:
|
| 147 |
+
found_keywords[category] = list(set(found_keywords[category]))[:5]
|
| 148 |
+
|
| 149 |
+
return found_keywords
|
| 150 |
+
|
| 151 |
+
def calculate_priority_score(flags):
|
| 152 |
+
"""Calculate priority score based on flags"""
|
| 153 |
+
# Base weights for different flags
|
| 154 |
+
weights = {
|
| 155 |
+
"fact_check_value": 1.0,
|
| 156 |
+
"cause_confusion": 1.5,
|
| 157 |
+
"cause_chaos": 1.8,
|
| 158 |
+
"affects_government": 1.0,
|
| 159 |
+
"economic_impact": 0.8,
|
| 160 |
+
"law_related": 0.8,
|
| 161 |
+
"public_interest": 1.2,
|
| 162 |
+
"lives_in_danger": 1.5,
|
| 163 |
+
"viral": 1.0,
|
| 164 |
+
"urgent": 2.0
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
# Calculate weighted score
|
| 168 |
+
score = 0
|
| 169 |
+
for flag, value in flags.items():
|
| 170 |
+
if flag in weights and value == 1:
|
| 171 |
+
score += weights[flag]
|
| 172 |
+
|
| 173 |
+
# Normalize to 0-10 scale
|
| 174 |
+
max_possible_score = sum(weights.values())
|
| 175 |
+
normalized_score = (score / max_possible_score) * 10
|
| 176 |
+
|
| 177 |
+
# Cap at 10
|
| 178 |
+
return min(normalized_score, 10.0)
|
| 179 |
+
|
| 180 |
+
def get_priority_level(score):
|
| 181 |
+
"""Get priority level based on score"""
|
| 182 |
+
if score >= 8.0:
|
| 183 |
+
return "TINGGI"
|
| 184 |
+
elif score >= 5.0:
|
| 185 |
+
return "SEDERHANA"
|
| 186 |
+
else:
|
| 187 |
+
return "RENDAH"
|
| 188 |
+
|
| 189 |
+
def run(sentiment_csv, agencies_csv=None, output_path=None, claim=None, claim_id=None, keywords=None):
|
| 190 |
+
"""
|
| 191 |
+
Run priority indexing on sentiment data
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
sentiment_csv (str): Path to sentiment CSV file
|
| 195 |
+
agencies_csv (str, optional): Path to agencies CSV file
|
| 196 |
+
output_path (str, optional): Path to output JSON file
|
| 197 |
+
claim (str, optional): The claim text
|
| 198 |
+
claim_id (str, optional): Unique identifier for the claim
|
| 199 |
+
keywords (list, optional): List of keywords
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
dict: Priority report data
|
| 203 |
+
"""
|
| 204 |
+
print(f"[🔍] Loading sentiment data from: {sentiment_csv}")
|
| 205 |
+
|
| 206 |
+
try:
|
| 207 |
+
df = pd.read_csv(sentiment_csv)
|
| 208 |
+
except Exception as e:
|
| 209 |
+
print(f"[❌] Error reading sentiment data: {e}")
|
| 210 |
+
return None
|
| 211 |
+
|
| 212 |
+
# Load agency keywords
|
| 213 |
+
agency_keywords = load_agency_keywords(agencies_csv)
|
| 214 |
+
|
| 215 |
+
# Initialize flags
|
| 216 |
+
flags = {
|
| 217 |
+
"fact_check_value": 0,
|
| 218 |
+
"cause_confusion": 0,
|
| 219 |
+
"cause_chaos": 0,
|
| 220 |
+
"affects_government": 0,
|
| 221 |
+
"economic_impact": 0,
|
| 222 |
+
"law_related": 0,
|
| 223 |
+
"public_interest": 0,
|
| 224 |
+
"lives_in_danger": 0,
|
| 225 |
+
"viral": 0,
|
| 226 |
+
"urgent": 0
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
# Calculate sentiment counts
|
| 230 |
+
sentiment_counts = df['sentiment'].value_counts().to_dict()
|
| 231 |
+
|
| 232 |
+
# Convert numeric sentiments to text
|
| 233 |
+
sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
|
| 234 |
+
text_counts = {}
|
| 235 |
+
|
| 236 |
+
for k, v in sentiment_counts.items():
|
| 237 |
+
if k in sentiment_map:
|
| 238 |
+
text_counts[sentiment_map[k]] = v
|
| 239 |
+
else:
|
| 240 |
+
text_counts[k] = v
|
| 241 |
+
|
| 242 |
+
# Get total records
|
| 243 |
+
total_records = len(df)
|
| 244 |
+
|
| 245 |
+
# Calculate engagement metrics
|
| 246 |
+
total_likes = df['likes'].sum() if 'likes' in df.columns else 0
|
| 247 |
+
total_shares = df['shares'].sum() if 'shares' in df.columns else 0
|
| 248 |
+
total_comments = df['comments'].sum() if 'comments' in df.columns else (df['comments_count'].sum() if 'comments_count' in df.columns else 0)
|
| 249 |
+
total_views = df['views'].sum() if 'views' in df.columns else 0
|
| 250 |
+
|
| 251 |
+
total_engagement = total_likes + total_shares + total_comments + total_views
|
| 252 |
+
|
| 253 |
+
# Check fact_check_value flag (based on engagement)
|
| 254 |
+
# Rule: High engagement indicates need for fact checking
|
| 255 |
+
if total_engagement > 10000:
|
| 256 |
+
flags["fact_check_value"] = 1
|
| 257 |
+
print(f"[📊] Flag: fact_check_value triggered (Total engagement: {total_engagement})")
|
| 258 |
+
|
| 259 |
+
# Check sentiment-based flags
|
| 260 |
+
pos = text_counts.get("positive", 0)
|
| 261 |
+
neg = text_counts.get("negative", 0)
|
| 262 |
+
neu = text_counts.get("neutral", 0)
|
| 263 |
+
|
| 264 |
+
total_sentiment = pos + neg + neu
|
| 265 |
+
if total_sentiment > 0:
|
| 266 |
+
pos_ratio = pos / total_sentiment
|
| 267 |
+
neg_ratio = neg / total_sentiment
|
| 268 |
+
neu_ratio = neu / total_sentiment
|
| 269 |
+
|
| 270 |
+
# Rule: cause_confusion if positive = negative OR neutral is high
|
| 271 |
+
if (abs(pos_ratio - neg_ratio) < 0.2 and pos_ratio > 0.2 and neg_ratio > 0.2) or (neu_ratio > 0.7):
|
| 272 |
+
flags["cause_confusion"] = 1
|
| 273 |
+
print(f"[📊] Flag: cause_confusion triggered (Pos: {pos_ratio:.2f}, Neg: {neg_ratio:.2f}, Neu: {neu_ratio:.2f})")
|
| 274 |
+
|
| 275 |
+
# Rule: cause_chaos if negative sentiment is high
|
| 276 |
+
if neg_ratio > 0.4:
|
| 277 |
+
flags["cause_chaos"] = 1
|
| 278 |
+
print(f"[📊] Flag: cause_chaos triggered (Negative: {neg_ratio:.2f})")
|
| 279 |
+
|
| 280 |
+
# Analyze text content for keywords
|
| 281 |
+
found_keywords = analyze_text_content(df, agency_keywords)
|
| 282 |
+
|
| 283 |
+
# Check government-related flag
|
| 284 |
+
# Rule: Contains government-related keywords
|
| 285 |
+
if found_keywords["government"]:
|
| 286 |
+
flags["affects_government"] = 1
|
| 287 |
+
print(f"[📊] Flag: affects_government triggered (Gov terms: {', '.join(found_keywords['government'])})")
|
| 288 |
+
|
| 289 |
+
# Check economic impact flag
|
| 290 |
+
# Rule: Contains economic-related keywords
|
| 291 |
+
if found_keywords["economic"]:
|
| 292 |
+
flags["economic_impact"] = 1
|
| 293 |
+
print(f"[📊] Flag: economic_impact triggered (Economic terms: {', '.join(found_keywords['economic'])})")
|
| 294 |
+
|
| 295 |
+
# Check law-related flag
|
| 296 |
+
# Rule: Contains law-related keywords
|
| 297 |
+
if found_keywords["law"]:
|
| 298 |
+
flags["law_related"] = 1
|
| 299 |
+
print(f"[📊] Flag: law_related triggered (Law terms: {', '.join(found_keywords['law'])})")
|
| 300 |
+
|
| 301 |
+
# Check public interest flag
|
| 302 |
+
# Rule: High comments and shares indicate public interest
|
| 303 |
+
if (total_comments + total_shares) > 1000:
|
| 304 |
+
flags["public_interest"] = 1
|
| 305 |
+
print(f"[📊] Flag: public_interest triggered (Comments + Shares: {total_comments + total_shares})")
|
| 306 |
+
|
| 307 |
+
# Check danger-related flag
|
| 308 |
+
# Rule: Contains danger-related keywords
|
| 309 |
+
if found_keywords["danger"]:
|
| 310 |
+
flags["lives_in_danger"] = 1
|
| 311 |
+
print(f"[📊] Flag: lives_in_danger triggered (Danger terms: {', '.join(found_keywords['danger'])})")
|
| 312 |
+
|
| 313 |
+
# Check viral flag
|
| 314 |
+
# Rule: High shares indicate virality
|
| 315 |
+
if total_shares > 1000:
|
| 316 |
+
flags["viral"] = 1
|
| 317 |
+
print(f"[📊] Flag: viral triggered (Total shares: {total_shares})")
|
| 318 |
+
|
| 319 |
+
# Check urgent flag
|
| 320 |
+
# Rule: If 5 or more flags are triggered, it's urgent
|
| 321 |
+
flags_triggered = sum(flags.values())
|
| 322 |
+
if flags_triggered >= 5:
|
| 323 |
+
flags["urgent"] = 1
|
| 324 |
+
print(f"[📊] Flag: urgent triggered ({flags_triggered} flags triggered)")
|
| 325 |
+
|
| 326 |
+
# Calculate priority score
|
| 327 |
+
priority_score = calculate_priority_score(flags)
|
| 328 |
+
priority_level = get_priority_level(priority_score)
|
| 329 |
+
|
| 330 |
+
# Prepare report data
|
| 331 |
+
report_data = {
|
| 332 |
+
"priority_flags": flags,
|
| 333 |
+
"priority_score": priority_score,
|
| 334 |
+
"priority_level": priority_level,
|
| 335 |
+
"sentiment_counts": text_counts,
|
| 336 |
+
"total_records": total_records,
|
| 337 |
+
"engagement": {
|
| 338 |
+
"likes": int(total_likes),
|
| 339 |
+
"shares": int(total_shares),
|
| 340 |
+
"comments": int(total_comments),
|
| 341 |
+
"views": int(total_views),
|
| 342 |
+
"total": int(total_engagement)
|
| 343 |
+
},
|
| 344 |
+
"found_keywords": found_keywords,
|
| 345 |
+
"claim": claim,
|
| 346 |
+
"keywords": keywords,
|
| 347 |
+
"timestamp": datetime.now().isoformat()
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
# Ensure output directory exists
|
| 351 |
+
if not output_path:
|
| 352 |
+
output_path = os.path.join("reports", os.path.basename(sentiment_csv).replace("_sentiment.csv", "_priority.json"))
|
| 353 |
+
|
| 354 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 355 |
+
with open(output_path, 'w') as f:
|
| 356 |
+
json.dump(report_data, f, indent=4)
|
| 357 |
+
|
| 358 |
+
print(f"[📊] Priority index saved to {output_path}")
|
| 359 |
+
print(f"[📊] Priority score: {priority_score:.2f}/10 ({priority_level})")
|
| 360 |
+
return report_data
|
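A quick worked example of the flag weighting in calculate_priority_score() and get_priority_level() above: three triggered flags worth 1.0 + 1.8 + 1.0 = 3.8 out of a maximum 12.6 normalize to about 3.02 on the 0-10 scale, which maps to "RENDAH". A minimal sketch, assuming this commit's import path:

from ai_api.library import priority_indexer

flags = {"fact_check_value": 1, "cause_chaos": 1, "viral": 1}
score = priority_indexer.calculate_priority_score(flags)  # (1.0 + 1.8 + 1.0) / 12.6 * 10
print(round(score, 2), priority_indexer.get_priority_level(score))  # 3.02 RENDAH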
ai_api/library/sentiment_analyzer.py
ADDED
|
@@ -0,0 +1,91 @@
|
| 1 |
+
# sentiment_analyzer.py
|
| 2 |
+
# Simple sentiment analyzer that doesn't require PyTorch
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
+
import random
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
def simple_sentiment_analysis(text):
|
| 10 |
+
"""
|
| 11 |
+
A very simple rule-based sentiment analyzer for demonstration purposes.
|
| 12 |
+
Returns a sentiment label (neutral, positive, negative) and confidence score.
|
| 13 |
+
"""
|
| 14 |
+
if not text or len(text.strip()) < 15:
|
| 15 |
+
return "neutral", 0.5
|
| 16 |
+
|
| 17 |
+
# Convert to lowercase
|
| 18 |
+
text = text.lower()
|
| 19 |
+
|
| 20 |
+
# Define positive and negative word lists (Malay and English)
|
| 21 |
+
positive_words = [
|
| 22 |
+
"baik", "bagus", "hebat", "cantik", "indah", "suka", "gembira", "senang",
|
| 23 |
+
"setuju", "betul", "benar", "berkesan", "berjaya", "cemerlang", "positif",
|
| 24 |
+
"good", "great", "excellent", "amazing", "wonderful", "happy", "like", "love",
|
| 25 |
+
"agree", "correct", "true", "effective", "successful", "positive"
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
negative_words = [
|
| 29 |
+
"buruk", "teruk", "hodoh", "benci", "marah", "sedih", "kecewa", "susah",
|
| 30 |
+
"tidak setuju", "salah", "palsu", "gagal", "negatif", "masalah", "bahaya",
|
| 31 |
+
"bad", "terrible", "ugly", "hate", "angry", "sad", "disappointed", "difficult",
|
| 32 |
+
"disagree", "wrong", "false", "fail", "negative", "problem", "dangerous"
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Count positive and negative words
|
| 36 |
+
positive_count = sum(1 for word in positive_words if re.search(r'\b' + re.escape(word) + r'\b', text))
|
| 37 |
+
negative_count = sum(1 for word in negative_words if re.search(r'\b' + re.escape(word) + r'\b', text))
|
| 38 |
+
|
| 39 |
+
# Determine sentiment
|
| 40 |
+
if positive_count > negative_count:
|
| 41 |
+
sentiment = "positive"
|
| 42 |
+
confidence = 0.5 + min(0.5, (positive_count - negative_count) / 10)
|
| 43 |
+
elif negative_count > positive_count:
|
| 44 |
+
sentiment = "negative"
|
| 45 |
+
confidence = 0.5 + min(0.5, (negative_count - positive_count) / 10)
|
| 46 |
+
else:
|
| 47 |
+
sentiment = "neutral"
|
| 48 |
+
confidence = 0.5
|
| 49 |
+
|
| 50 |
+
return sentiment, round(confidence, 4)
|
| 51 |
+
|
| 52 |
+
def run(csv_path, sentiment_output_path=None):
|
| 53 |
+
"""
|
| 54 |
+
Runs sentiment analysis on combined comment + post text from the input CSV.
|
| 55 |
+
Saves the result (with sentiment + confidence columns) to a new CSV.
|
| 56 |
+
"""
|
| 57 |
+
print(f"[📄] Reading dataset: {csv_path}")
|
| 58 |
+
df = pd.read_csv(csv_path)
|
| 59 |
+
|
| 60 |
+
# Combine comment and post text into a single field
|
| 61 |
+
df['combined_text'] = df['comment_text'].fillna('') + ". " + df['post_text'].fillna('')
|
| 62 |
+
df['combined_text'] = df['combined_text'].str.strip()
|
| 63 |
+
|
| 64 |
+
sentiments = []
|
| 65 |
+
confidences = []
|
| 66 |
+
|
| 67 |
+
print("[🔍] Running simple sentiment classification...")
|
| 68 |
+
for text in df['combined_text']:
|
| 69 |
+
sentiment, confidence = simple_sentiment_analysis(text)
|
| 70 |
+
sentiments.append(sentiment)
|
| 71 |
+
confidences.append(confidence)
|
| 72 |
+
|
| 73 |
+
# Add results to DataFrame
|
| 74 |
+
df['sentiment'] = sentiments
|
| 75 |
+
df['confidence'] = confidences
|
| 76 |
+
|
| 77 |
+
# Map sentiments to numeric values for compatibility with the rest of the system
|
| 78 |
+
sentiment_map = {
|
| 79 |
+
"neutral": 0,
|
| 80 |
+
"positive": 1,
|
| 81 |
+
"negative": 2
|
| 82 |
+
}
|
| 83 |
+
df['sentiment_value'] = df['sentiment'].map(sentiment_map)
|
| 84 |
+
|
| 85 |
+
# Determine the output path dynamically if not provided
|
| 86 |
+
if not sentiment_output_path:
|
| 87 |
+
sentiment_output_path = csv_path.replace(".csv", "_sentiment.csv")
|
| 88 |
+
|
| 89 |
+
df.to_csv(sentiment_output_path, index=False)
|
| 90 |
+
print(f"[💾] Sentiment analysis completed. Output saved to: {sentiment_output_path}")
|
| 91 |
+
|
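A quick check of the rule-based scorer above: confidence starts at 0.5 and rises by 0.1 per net matched word, capped at 1.0. The sample sentence is illustrative; it hits three positive Malay words ("bagus", "berkesan", "setuju") and no negative ones, so it scores ("positive", 0.8). The import path is assumed from this commit's layout:

from ai_api.library.sentiment_analyzer import simple_sentiment_analysis

label, conf = simple_sentiment_analysis(
    "Keputusan ini bagus dan berkesan, saya setuju sepenuhnya"
)
print(label, conf)  # positive 0.8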
ai_api/library/simple_keyword_extraction.py
ADDED
|
@@ -0,0 +1,205 @@
|
| 1 |
+
# simple_keyword_extraction.py
|
| 2 |
+
# Simple keyword extraction for the claim analysis system
|
| 3 |
+
|
| 4 |
+
import re
|
| 5 |
+
from collections import Counter
|
| 6 |
+
|
| 7 |
+
# Define Malay stopwords
|
| 8 |
+
MALAY_STOPWORDS = [
|
| 9 |
+
"ada", "adalah", "adanya", "adapun", "agak", "agaknya", "agar", "akan", "akankah", "akhir",
|
| 10 |
+
"akhiri", "akhirnya", "aku", "akulah", "amat", "amatlah", "anda", "andalah", "antar", "antara",
|
| 11 |
+
"antaranya", "apa", "apaan", "apabila", "apakah", "apalagi", "apatah", "artinya", "asal", "asalkan",
|
| 12 |
+
"atas", "atau", "ataukah", "ataupun", "awal", "awalnya", "bagai", "bagaikan", "bagaimana", "bagaimanakah",
|
| 13 |
+
"bagaimanapun", "bagi", "bagian", "bahkan", "bahwa", "bahwasanya", "baik", "bakal", "bakalan", "balik",
|
| 14 |
+
"banyak", "bapak", "baru", "bawah", "beberapa", "begini", "beginian", "beginikah", "beginilah", "begitu",
|
| 15 |
+
"begitukah", "begitulah", "begitupun", "bekerja", "belakang", "belakangan", "belum", "belumlah", "benar",
|
| 16 |
+
"benarkah", "benarlah", "berada", "berakhir", "berakhirlah", "berakhirnya", "berapa", "berapakah", "berapalah",
|
| 17 |
+
"berapapun", "berarti", "berawal", "berbagai", "berdatangan", "beri", "berikan", "berikut", "berikutnya",
|
| 18 |
+
"berjumlah", "berkali-kali", "berkata", "berkehendak", "berkeinginan", "berkenaan", "berlainan", "berlalu",
|
| 19 |
+
"berlangsung", "berlebihan", "bermacam", "bermacam-macam", "bermaksud", "bermula", "bersama", "bersama-sama",
|
| 20 |
+
"bersiap", "bersiap-siap", "bertanya", "bertanya-tanya", "berturut", "berturut-turut", "bertutur", "berujar",
|
| 21 |
+
"berupa", "besar", "betul", "betulkah", "biasa", "biasanya", "bila", "bilakah", "bisa", "bisakah", "boleh",
|
| 22 |
+
"bolehkah", "bolehlah", "buat", "bukan", "bukankah", "bukanlah", "bukannya", "bulan", "bung", "cara", "caranya",
|
| 23 |
+
"cukup", "cukupkah", "cukuplah", "cuma", "dahulu", "dalam", "dan", "dapat", "dari", "daripada", "datang",
|
| 24 |
+
"dekat", "demi", "demikian", "demikianlah", "dengan", "depan", "di", "dia", "diakhiri", "diakhirinya", "dialah",
|
| 25 |
+
"diantara", "diantaranya", "diberi", "diberikan", "diberikannya", "dibuat", "dibuatnya", "didapat", "didatangkan",
|
| 26 |
+
"digunakan", "diibaratkan", "diibaratkannya", "diingat", "diingatkan", "diinginkan", "dijawab", "dijelaskan",
|
| 27 |
+
"dijelaskannya", "dikarenakan", "dikatakan", "dikatakannya", "dikerjakan", "diketahui", "diketahuinya", "dikira",
|
| 28 |
+
"dilakukan", "dilalui", "dilihat", "dimaksud", "dimaksudkan", "dimaksudkannya", "dimaksudnya", "diminta",
|
| 29 |
+
"dimintai", "dimisalkan", "dimulai", "dimulailah", "dimulainya", "dimungkinkan", "dini", "dipastikan",
|
| 30 |
+
"diperbuat", "diperbuatnya", "dipergunakan", "diperkirakan", "diperlihatkan", "diperlukan", "diperlukannya",
|
| 31 |
+
"dipersoalkan", "dipertanyakan", "dipunyai", "diri", "dirinya", "disampaikan", "disebut", "disebutkan",
|
| 32 |
+
"disebutkannya", "disini", "disinilah", "ditambahkan", "ditandaskan", "ditanya", "ditanyai", "ditanyakan",
|
| 33 |
+
"ditegaskan", "ditujukan", "ditunjuk", "ditunjuki", "ditunjukkan", "ditunjukkannya", "ditunjuknya", "dituturkan",
|
| 34 |
+
"dituturkannya", "diucapkan", "diucapkannya", "diungkapkan", "dong", "dua", "dulu", "empat", "enggak", "enggaknya",
|
| 35 |
+
"entah", "entahlah", "guna", "gunakan", "hal", "hampir", "hanya", "hanyalah", "hari", "harus", "haruslah",
|
| 36 |
+
"harusnya", "hendak", "hendaklah", "hendaknya", "hingga", "ia", "ialah", "ibarat", "ibaratkan", "ibaratnya",
|
| 37 |
+
"ibu", "ikut", "ingat", "ingat-ingat", "ingin", "inginkah", "inginkan", "ini", "inikah", "inilah", "itu",
|
| 38 |
+
"itukah", "itulah", "jadi", "jadilah", "jadinya", "jangan", "jangankan", "janganlah", "jauh", "jawab",
|
| 39 |
+
"jawaban", "jawabnya", "jelas", "jelaskan", "jelaslah", "jelasnya", "jika", "jikalau", "juga", "jumlah",
|
| 40 |
+
"jumlahnya", "justru", "kala", "kalau", "kalaulah", "kalaupun", "kalian", "kami", "kamilah", "kamu", "kamulah",
|
| 41 |
+
"kan", "kapan", "kapankah", "kapanpun", "karena", "karenanya", "kasus", "kata", "katakan", "katakanlah",
|
| 42 |
+
"katanya", "ke", "keadaan", "kebetulan", "kecil", "kedua", "keduanya", "keinginan", "kelamaan", "kelihatan",
|
| 43 |
+
"kelihatannya", "kelima", "keluar", "kembali", "kemudian", "kemungkinan", "kemungkinannya", "kenapa", "kepada",
|
| 44 |
+
"kepadanya", "kesamaan", "keseluruhan", "keseluruhannya", "keterlaluan", "ketika", "khususnya", "kini", "kinilah",
|
| 45 |
+
"kira", "kira-kira", "kiranya", "kita", "kitalah", "kok", "kurang", "lagi", "lagian", "lah", "lain", "lainnya",
|
| 46 |
+
"lalu", "lama", "lamanya", "lanjut", "lanjutnya", "lebih", "lewat", "lima", "luar", "macam", "maka", "makanya",
|
| 47 |
+
"makin", "malah", "malahan", "mampu", "mampukah", "mana", "manakala", "manalagi", "masa", "masalah", "masalahnya",
|
| 48 |
+
"masih", "masihkah", "masing", "masing-masing", "mau", "maupun", "melainkan", "melakukan", "melalui", "melihat",
|
| 49 |
+
"melihatnya", "memang", "memastikan", "memberi", "memberikan", "membuat", "memerlukan", "memihak", "meminta",
|
| 50 |
+
"memintakan", "memisalkan", "memperbuat", "mempergunakan", "memperkirakan", "memperlihatkan", "mempersiapkan",
|
| 51 |
+
"mempersoalkan", "mempertanyakan", "mempunyai", "memulai", "memungkinkan", "menaiki", "menambahkan", "menandaskan",
|
| 52 |
+
"menanti", "menanti-nanti", "menantikan", "menanya", "menanyai", "menanyakan", "mendapat", "mendapatkan",
|
| 53 |
+
"mendatang", "mendatangi", "mendatangkan", "menegaskan", "mengakhiri", "mengapa", "mengatakan", "mengatakannya",
|
| 54 |
+
"mengenai", "mengerjakan", "mengetahui", "menggunakan", "menghendaki", "mengibaratkan", "mengibaratkannya",
|
| 55 |
+
"mengingat", "mengingatkan", "menginginkan", "mengira", "mengucapkan", "mengucapkannya", "mengungkapkan",
|
| 56 |
+
"menjadi", "menjawab", "menjelaskan", "menuju", "menunjuk", "menunjuki", "menunjukkan", "menunjuknya", "menurut",
|
| 57 |
+
"menuturkan", "menyampaikan", "menyangkut", "menyatakan", "menyebutkan", "menyeluruh", "menyiapkan", "merasa",
|
| 58 |
+
"mereka", "merekalah", "merupakan", "meski", "meskipun", "meyakini", "meyakinkan", "minta", "mirip", "misal",
|
| 59 |
+
"misalkan", "misalnya", "mula", "mulai", "mulailah", "mulanya", "mungkin", "mungkinkah", "nah", "naik", "namun",
|
| 60 |
+
"nanti", "nantinya", "nyaris", "nyatanya", "oleh", "olehnya", "pada", "padahal", "padanya", "pak", "paling",
|
| 61 |
+
"panjang", "pantas", "para", "pasti", "pastilah", "penting", "pentingnya", "per", "percuma", "perlu", "perlukah",
|
| 62 |
+
"perlunya", "pernah", "persoalan", "pertama", "pertama-tama", "pertanyaan", "pertanyakan", "pihak", "pihaknya",
|
| 63 |
+
"pukul", "pula", "pun", "punya", "rasa", "rasanya", "rata", "rupanya", "saat", "saatnya", "saja", "sajalah",
|
| 64 |
+
"saling", "sama", "sama-sama", "sambil", "sampai", "sampai-sampai", "sampaikan", "sana", "sangat", "sangatlah",
|
| 65 |
+
"satu", "saya", "sayalah", "se", "sebab", "sebabnya", "sebagai", "sebagaimana", "sebagainya", "sebagian",
|
| 66 |
+
"sebaik", "sebaik-baiknya", "sebaiknya", "sebaliknya", "sebanyak", "sebegini", "sebegitu", "sebelum", "sebelumnya",
|
| 67 |
+
"sebenarnya", "seberapa", "sebesar", "sebetulnya", "sebisanya", "sebuah", "sebut", "sebutlah", "sebutnya",
|
| 68 |
+
"secara", "secukupnya", "sedang", "sedangkan", "sedemikian", "sedikit", "sedikitnya", "seenaknya", "segala",
|
| 69 |
+
"segalanya", "segera", "seharusnya", "sehingga", "seingat", "sejak", "sejauh", "sejenak", "sejumlah", "sekadar",
|
| 70 |
+
"sekadarnya", "sekali", "sekali-kali", "sekalian", "sekaligus", "sekalipun", "sekarang", "sekarang", "sekecil",
|
| 71 |
+
"seketika", "sekiranya", "sekitar", "sekitarnya", "sekurang-kurangnya", "sekurangnya", "sela", "selain", "selaku",
|
| 72 |
+
"selalu", "selama", "selama-lamanya", "selamanya", "selanjutnya", "seluruh", "seluruhnya", "semacam", "semakin",
|
| 73 |
+
"semampu", "semampunya", "semasa", "semasih", "semata", "semata-mata", "semaunya", "sementara", "semisal",
|
| 74 |
+
"semisalnya", "sempat", "semua", "semuanya", "semula", "sendiri", "sendirian", "sendirinya", "seolah",
|
| 75 |
+
"seolah-olah", "seorang", "sepanjang", "sepantasnya", "sepantasnyalah", "seperlunya", "seperti", "sepertinya",
|
| 76 |
+
"sepihak", "sering", "seringnya", "serta", "serupa", "sesaat", "sesama", "sesampai", "sesegera", "sesekali",
|
| 77 |
+
"seseorang", "sesuatu", "sesuatunya", "sesudah", "sesudahnya", "setelah", "setempat", "setengah", "seterusnya",
|
| 78 |
+
"setiap", "setiba", "setibanya", "setidak-tidaknya", "setidaknya", "setinggi", "seusai", "sewaktu", "siap",
|
| 79 |
+
"siapa", "siapakah", "siapapun", "sini", "sinilah", "soal", "soalnya", "suatu", "sudah", "sudahkah", "sudahlah",
|
| 80 |
+
"supaya", "tadi", "tadinya", "tahu", "tahun", "tak", "tambah", "tambahnya", "tampak", "tampaknya", "tandas",
|
| 81 |
+
"tandasnya", "tanpa", "tanya", "tanyakan", "tanyanya", "tapi", "tegas", "tegasnya", "telah", "tempat", "tengah",
|
| 82 |
+
"tentang", "tentu", "tentulah", "tentunya", "tepat", "terakhir", "terasa", "terbanyak", "terdahulu", "terdapat",
|
| 83 |
+
"terdiri", "terhadap", "terhadapnya", "teringat", "teringat-ingat", "terjadi", "terjadilah", "terjadinya",
|
| 84 |
+
"terkira", "terlalu", "terlebih", "terlihat", "termasuk", "ternyata", "tersampaikan", "tersebut", "tersebutlah",
|
| 85 |
+
"tertentu", "tertuju", "terus", "terutama", "tetap", "tetapi", "tiap", "tiba", "tiba-tiba", "tidak", "tidakkah",
|
| 86 |
+
"tidaklah", "tiga", "tinggi", "toh", "tunjuk", "turut", "tutur", "tuturnya", "ucap", "ucapnya", "ujar", "ujarnya",
|
| 87 |
+
"umum", "umumnya", "ungkap", "ungkapnya", "untuk", "usah", "usai", "waduh", "wah", "wahai", "waktu", "waktunya",
|
| 88 |
+
"walau", "walaupun", "wong", "yaitu", "yakin", "yakni", "yang", "ke", "pada", "ini", "itu", "juga", "dari", "dalam",
|
| 89 |
+
"akan", "jika", "maka", "karena", "oleh", "dengan", "atau", "secara", "untuk", "adalah", "sebagai", "bahwa", "hanya",
|
| 90 |
+
"namun", "tetapi", "ketika", "setelah", "sebelum", "selama", "sejak", "hingga", "sampai", "tentang", "seperti",
|
| 91 |
+
"terhadap", "melalui", "menurut", "berdasarkan", "mengenai", "antara", "di", "si", "sang", "para", "the", "of", "and",
|
| 92 |
+
"a", "to", "in", "that", "it", "with", "as", "for", "on", "was", "is", "by", "at", "this", "an", "are", "not", "from",
|
| 93 |
+
"but", "have", "had", "has", "be", "been", "were", "which", "or", "we", "their", "his", "her", "they", "its", "he",
|
| 94 |
+
"she", "you", "my", "all", "can", "would", "could", "should", "may", "might", "must", "shall", "will", "them", "there",
|
| 95 |
+
"these", "those", "some", "any", "no", "nor", "so", "such", "than", "then", "thus", "up", "down", "out", "about", "into",
|
| 96 |
+
"over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "what", "who",
|
| 97 |
+
"whom", "this", "that", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
|
| 98 |
+
"does", "did", "doing", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
|
| 99 |
+
"yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
|
| 100 |
+
"their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
|
| 101 |
+
"was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
|
| 102 |
+
"and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
|
| 103 |
+
"between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out",
|
| 104 |
+
"on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all",
|
| 105 |
+
"any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
|
| 106 |
+
"than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
|
| 107 |
+
]
|
| 108 |
+
|
| 109 |
+
def extract_keywords(text, top_n=10):
|
| 110 |
+
"""
|
| 111 |
+
Extract keywords from text using a simple frequency-based approach
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
text (str): Text to extract keywords from
|
| 115 |
+
top_n (int): Number of keywords to extract
|
| 116 |
+
|
| 117 |
+
Returns:
|
| 118 |
+
list: List of extracted keywords
|
| 119 |
+
"""
|
| 120 |
+
# Convert to lowercase
|
| 121 |
+
text = text.lower()
|
| 122 |
+
|
| 123 |
+
# Remove punctuation and split into words
|
| 124 |
+
words = re.findall(r'\b\w+\b', text)
|
| 125 |
+
|
| 126 |
+
# Remove stopwords
|
| 127 |
+
words = [word for word in words if word not in MALAY_STOPWORDS and len(word) > 2]
|
| 128 |
+
|
| 129 |
+
# Count word frequencies
|
| 130 |
+
word_counts = Counter(words)
|
| 131 |
+
|
| 132 |
+
# Get top N keywords
|
| 133 |
+
keywords = [word for word, count in word_counts.most_common(top_n)]
|
| 134 |
+
|
| 135 |
+
# If we have fewer than top_n keywords, return what we have
|
| 136 |
+
return keywords
|
| 137 |
+
|
| 138 |
+
def optimize_keywords_for_platforms(keywords):
|
| 139 |
+
"""
|
| 140 |
+
Optimize keywords for different platforms
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
keywords (list): List of keywords
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
dict: Dictionary with optimized keywords for each platform
|
| 147 |
+
"""
|
| 148 |
+
return {
|
| 149 |
+
"tiktok": keywords[:3],
|
| 150 |
+
"web_search": keywords[:5]
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
def detect_claim_type(text):
|
| 154 |
+
"""
|
| 155 |
+
Detect the type of claim based on keywords
|
| 156 |
+
|
| 157 |
+
Args:
|
| 158 |
+
text (str): The claim text
|
| 159 |
+
|
| 160 |
+
Returns:
|
| 161 |
+
str: The type of claim
|
| 162 |
+
"""
|
| 163 |
+
text = text.lower()
|
| 164 |
+
|
| 165 |
+
# Define keyword sets for different claim types
|
| 166 |
+
economic_keywords = ["ekonomi", "cukai", "harga", "kewangan", "bank", "ringgit", "subsidi", "kos", "bayaran", "hutang"]
|
| 167 |
+
political_keywords = ["kerajaan", "politik", "perdana menteri", "menteri", "parlimen", "pilihan raya", "parti", "kabinet"]
|
| 168 |
+
health_keywords = ["kesihatan", "penyakit", "hospital", "vaksin", "ubat", "doktor", "covid", "virus", "pandemik"]
|
| 169 |
+
social_keywords = ["sosial", "masyarakat", "pendidikan", "sekolah", "universiti", "pelajar", "guru", "agama"]
|
| 170 |
+
security_keywords = ["keselamatan", "polis", "tentera", "jenayah", "penjenayah", "senjata", "serangan"]
|
| 171 |
+
|
| 172 |
+
# Count matches for each category
|
| 173 |
+
economic_count = sum(1 for keyword in economic_keywords if keyword in text)
|
| 174 |
+
political_count = sum(1 for keyword in political_keywords if keyword in text)
|
| 175 |
+
health_count = sum(1 for keyword in health_keywords if keyword in text)
|
| 176 |
+
social_count = sum(1 for keyword in social_keywords if keyword in text)
|
| 177 |
+
security_count = sum(1 for keyword in security_keywords if keyword in text)
|
| 178 |
+
|
| 179 |
+
# Determine the dominant category
|
| 180 |
+
counts = {
|
| 181 |
+
"Ekonomi": economic_count,
|
| 182 |
+
"Politik": political_count,
|
| 183 |
+
"Kesihatan": health_count,
|
| 184 |
+
"Sosial": social_count,
|
| 185 |
+
"Keselamatan": security_count
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
# Get the category with the highest count
|
| 189 |
+
dominant_category = max(counts, key=counts.get)
|
| 190 |
+
|
| 191 |
+
# If no matches, return "Umum"
|
| 192 |
+
if counts[dominant_category] == 0:
|
| 193 |
+
return "Umum"
|
| 194 |
+
|
| 195 |
+
return dominant_category
|
| 196 |
+
|
| 197 |
+
if __name__ == "__main__":
|
| 198 |
+
# Test the function
|
| 199 |
+
test_text = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
|
| 200 |
+
keywords = extract_keywords(test_text)
|
| 201 |
+
print(f"Extracted keywords: {keywords}")
|
| 202 |
+
|
| 203 |
+
optimized = optimize_keywords_for_platforms(keywords)
|
| 204 |
+
print(f"Optimized for TikTok: {optimized['tiktok']}")
|
| 205 |
+
print(f"Optimized for web search: {optimized['web_search']}")
|
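The __main__ block above exercises extract_keywords and optimize_keywords_for_platforms but not detect_claim_type; a minimal sketch of calling it on the same sample claim (the expected output follows from the keyword counts above, not from a captured run; the import path mirrors how views.py loads this module):

from ai_api.library.simple_keyword_extraction import detect_claim_type

claim = "Perkenal Cukai Khas Minyak Sawit Mentah Adalah Cadangan Sebuah Persatuan, Bukannya Kerajaan"
print(detect_claim_type(claim))
# The lowercased claim matches "cukai" (Ekonomi) and "kerajaan" (Politik), a 1-1 tie;
# max() keeps the first key with the highest count, so this prints "Ekonomi".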
ai_api/library/websearch.py
ADDED
|
@@ -0,0 +1,237 @@
| 1 |
+
"""
|
| 2 |
+
websearch.py
|
| 3 |
+
Module for running web searches and saving results
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True, use_duckduckgo=True, full_claim=None):
|
| 11 |
+
"""
|
| 12 |
+
Run web search for keywords and save results to CSV
|
| 13 |
+
|
| 14 |
+
Args:
|
| 15 |
+
keywords (list): List of keywords to search for
|
| 16 |
+
output_path (str): Path to save results
|
| 17 |
+
num_results (int): Number of results per keyword
|
| 18 |
+
use_serpapi (bool): Whether to use SerpApi
|
| 19 |
+
use_serper (bool): Whether to use Serper.dev
|
| 20 |
+
use_duckduckgo (bool): Whether to use DuckDuckGo
|
| 21 |
+
full_claim (str): The full claim text to use as a search query
|
| 22 |
+
|
| 23 |
+
Returns:
|
| 24 |
+
int: Number of results saved
|
| 25 |
+
"""
|
| 26 |
+
# Import search functions
|
| 27 |
+
try:
|
| 28 |
+
from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
|
| 29 |
+
except ImportError:
|
| 30 |
+
print("Error importing web_search module. Make sure it exists and is accessible.")
|
| 31 |
+
return 0
|
| 32 |
+
|
| 33 |
+
# Create search queries
|
| 34 |
+
all_results = []
|
| 35 |
+
|
| 36 |
+
# Always use the full claim directly if available
|
| 37 |
+
if full_claim:
|
| 38 |
+
print(f"Using full claim as direct search query: '{full_claim}'")
|
| 39 |
+
|
| 40 |
+
# Search using SerpApi with the exact claim
|
| 41 |
+
if use_serpapi:
|
| 42 |
+
print("Searching with SerpApi (exact claim)...")
|
| 43 |
+
serpapi_results = search_serpapi(full_claim, num_results=num_results)
|
| 44 |
+
if serpapi_results:
|
| 45 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (exact claim)")
|
| 46 |
+
all_results.extend(serpapi_results)
|
| 47 |
+
else:
|
| 48 |
+
print("No results from SerpApi (exact claim)")
|
| 49 |
+
|
| 50 |
+
# Search using Serper.dev with the exact claim
|
| 51 |
+
if use_serper:
|
| 52 |
+
print("Searching with Serper.dev (exact claim)...")
|
| 53 |
+
serper_results = search_serper(full_claim, num_results=num_results)
|
| 54 |
+
if serper_results:
|
| 55 |
+
print(f"Found {len(serper_results)} results from Serper.dev (exact claim)")
|
| 56 |
+
all_results.extend(serper_results)
|
| 57 |
+
else:
|
| 58 |
+
print("No results from Serper.dev (exact claim)")
|
| 59 |
+
|
| 60 |
+
# For crime-related claims, also try targeted queries
|
| 61 |
+
crime_related = any(term in full_claim.lower() for term in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"])
|
| 62 |
+
kelantan_related = "kelantan" in full_claim.lower()
|
| 63 |
+
|
| 64 |
+
if crime_related and kelantan_related:
|
| 65 |
+
# Check if this is about sexual crimes or ammunition
|
| 66 |
+
ammunition_related = any(term in full_claim.lower() for term in ["kelongsong", "peluru", "senjata", "tan"])
|
| 67 |
+
|
| 68 |
+
if ammunition_related:
|
| 69 |
+
targeted_queries = [
|
| 70 |
+
"50 tan kelongsong peluru ditemui",
|
| 71 |
+
"kilang haram proses kelongsong peluru",
|
| 72 |
+
"penemuan kelongsong peluru di kilang",
|
| 73 |
+
"kelongsong peluru musuh negara"
|
| 74 |
+
]
|
| 75 |
+
else:
|
| 76 |
+
# Default to sexual crime queries
|
| 77 |
+
targeted_queries = [
|
| 78 |
+
"statistik jenayah seksual di kelantan",
|
| 79 |
+
"kes rogol dan sumbang mahram di kelantan meningkat",
|
| 80 |
+
"pdrm kelantan lapor kes rogol"
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
for query in targeted_queries:
|
| 84 |
+
print(f"Using targeted query: '{query}'")
|
| 85 |
+
|
| 86 |
+
# Search using SerpApi
|
| 87 |
+
if use_serpapi:
|
| 88 |
+
print(f"Searching with SerpApi (targeted query: {query})...")
|
| 89 |
+
serpapi_results = search_serpapi(query, num_results=num_results//2) # Use fewer results for each targeted query
|
| 90 |
+
if serpapi_results:
|
| 91 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (targeted query)")
|
| 92 |
+
all_results.extend(serpapi_results)
|
| 93 |
+
else:
|
| 94 |
+
print(f"No results from SerpApi (targeted query: {query})")
|
| 95 |
+
|
| 96 |
+
# Search using Serper.dev
|
| 97 |
+
if use_serper:
|
| 98 |
+
print(f"Searching with Serper.dev (targeted query: {query})...")
|
| 99 |
+
serper_results = search_serper(query, num_results=num_results//2) # Use fewer results for each targeted query
|
| 100 |
+
if serper_results:
|
| 101 |
+
print(f"Found {len(serper_results)} results from Serper.dev (targeted query)")
|
| 102 |
+
all_results.extend(serper_results)
|
| 103 |
+
else:
|
| 104 |
+
print(f"No results from Serper.dev (targeted query: {query})")
|
| 105 |
+
else:
|
| 106 |
+
# For other claims, use the original approach with keywords
|
| 107 |
+
# 1. Full claim query (if available)
|
| 108 |
+
full_claim_query = f'"{full_claim}"' if full_claim else None
|
| 109 |
+
|
| 110 |
+
# 2. Keyword-based query
|
| 111 |
+
search_terms = []
|
| 112 |
+
for kw in keywords:
|
| 113 |
+
# If keyword contains spaces (multi-word phrase), wrap in quotes
|
| 114 |
+
if " " in kw:
|
| 115 |
+
search_terms.append(f'"{kw}"')
|
| 116 |
+
else:
|
| 117 |
+
# For single words, don't use quotes to get broader results
|
| 118 |
+
search_terms.append(kw)
|
| 119 |
+
|
| 120 |
+
keyword_query = " OR ".join(search_terms)
|
| 121 |
+
|
| 122 |
+
# Search using full claim first (if available)
|
| 123 |
+
if full_claim_query:
|
| 124 |
+
print(f"Searching with full claim: {full_claim_query}")
|
| 125 |
+
|
| 126 |
+
# Search using SerpApi
|
| 127 |
+
if use_serpapi:
|
| 128 |
+
print("Searching with SerpApi (full claim)...")
|
| 129 |
+
serpapi_results = search_serpapi(full_claim, num_results=num_results)
|
| 130 |
+
if serpapi_results:
|
| 131 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (full claim)")
|
| 132 |
+
all_results.extend(serpapi_results)
|
| 133 |
+
else:
|
| 134 |
+
print("No results from SerpApi (full claim)")
|
| 135 |
+
|
| 136 |
+
# Search using Serper.dev
|
| 137 |
+
if use_serper:
|
| 138 |
+
print("Searching with Serper.dev (full claim)...")
|
| 139 |
+
serper_results = search_serper(full_claim, num_results=num_results)
|
| 140 |
+
if serper_results:
|
| 141 |
+
print(f"Found {len(serper_results)} results from Serper.dev (full claim)")
|
| 142 |
+
all_results.extend(serper_results)
|
| 143 |
+
else:
|
| 144 |
+
print("No results from Serper.dev (full claim)")
|
| 145 |
+
|
| 146 |
+
# Search using keyword query as fallback
|
| 147 |
+
if not all_results or len(all_results) < num_results:
|
| 148 |
+
print(f"Searching with keyword query: {keyword_query}")
|
| 149 |
+
|
| 150 |
+
# Search using SerpApi
|
| 151 |
+
if use_serpapi:
|
| 152 |
+
print("Searching with SerpApi (keywords)...")
|
| 153 |
+
serpapi_results = search_serpapi(keyword_query, num_results=num_results)
|
| 154 |
+
if serpapi_results:
|
| 155 |
+
print(f"Found {len(serpapi_results)} results from SerpApi (keywords)")
|
| 156 |
+
all_results.extend(serpapi_results)
|
| 157 |
+
else:
|
| 158 |
+
print("No results from SerpApi (keywords)")
|
| 159 |
+
|
| 160 |
+
# Search using Serper.dev
|
| 161 |
+
if use_serper:
|
| 162 |
+
print("Searching with Serper.dev (keywords)...")
|
| 163 |
+
serper_results = search_serper(keyword_query, num_results=num_results)
|
| 164 |
+
if serper_results:
|
| 165 |
+
print(f"Found {len(serper_results)} results from Serper.dev (keywords)")
|
| 166 |
+
all_results.extend(serper_results)
|
| 167 |
+
else:
|
| 168 |
+
print("No results from Serper.dev (keywords)")
|
| 169 |
+
|
| 170 |
+
# Add DuckDuckGo results
|
| 171 |
+
if use_duckduckgo:
|
| 172 |
+
query_to_use = full_claim if full_claim else keyword_query
|
| 173 |
+
print(f"Searching with DuckDuckGo using: {query_to_use}")
|
| 174 |
+
duckduckgo_results = search_duckduckgo(query_to_use, num_results=num_results)
|
| 175 |
+
if duckduckgo_results:
|
| 176 |
+
print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
|
| 177 |
+
all_results.extend(duckduckgo_results)
|
| 178 |
+
else:
|
| 179 |
+
print("No results from DuckDuckGo")
|
| 180 |
+
|
| 181 |
+
# Add Google Trends data
|
| 182 |
+
trends_data = get_google_trends(keywords)
|
| 183 |
+
|
| 184 |
+
# Convert to DataFrame
|
| 185 |
+
if all_results:
|
| 186 |
+
# Remove duplicates based on URL
|
| 187 |
+
unique_results = []
|
| 188 |
+
seen_urls = set()
|
| 189 |
+
|
| 190 |
+
for result in all_results:
|
| 191 |
+
url = result.get('link', '')
|
| 192 |
+
if url and url not in seen_urls:
|
| 193 |
+
seen_urls.add(url)
|
| 194 |
+
unique_results.append(result)
|
| 195 |
+
|
| 196 |
+
print(f"Removed {len(all_results) - len(unique_results)} duplicate results")
|
| 197 |
+
|
| 198 |
+
df = pd.DataFrame(unique_results)
|
| 199 |
+
|
| 200 |
+
# Add additional columns to match the format expected by the sentiment analyzer
|
| 201 |
+
df['platform'] = 'web'
|
| 202 |
+
df['username'] = df['source']
|
| 203 |
+
df['post_text'] = df['snippet']
|
| 204 |
+
df['post_url'] = df['link']
|
| 205 |
+
df['likes'] = 0
|
| 206 |
+
df['shares'] = 0
|
| 207 |
+
df['comments_count'] = 0
|
| 208 |
+
df['comment_text'] = ''
|
| 209 |
+
df['combined_text'] = df['title'] + ' ' + df['snippet']
|
| 210 |
+
df['date'] = datetime.now().strftime('%Y-%m-%d')
|
| 211 |
+
|
| 212 |
+
# Create output directory if it doesn't exist
|
| 213 |
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
| 214 |
+
|
| 215 |
+
# Save to CSV
|
| 216 |
+
df.to_csv(output_path, index=False)
|
| 217 |
+
print(f"Saved {len(df)} web search results to {output_path}")
|
| 218 |
+
return len(df)
|
| 219 |
+
else:
|
| 220 |
+
print("No web search results found")
|
| 221 |
+
return 0
|
| 222 |
+
|
| 223 |
+
# Test the module
|
| 224 |
+
if __name__ == "__main__":
|
| 225 |
+
import sys
|
| 226 |
+
|
| 227 |
+
# Get keywords from command line or use default
|
| 228 |
+
if len(sys.argv) > 1:
|
| 229 |
+
keywords = sys.argv[1:]
|
| 230 |
+
full_claim = " ".join(sys.argv[1:])
|
| 231 |
+
else:
|
| 232 |
+
keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
|
| 233 |
+
full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"
|
| 234 |
+
|
| 235 |
+
# Run web search
|
| 236 |
+
output_path = "output/web_search_results.csv"
|
| 237 |
+
run(keywords, output_path, num_results=10, full_claim=full_claim)
|
ai_api/middleware.py
ADDED
|
@@ -0,0 +1,40 @@
# middleware.py
import hashlib
import hmac
from django.http import JsonResponse
from ai_api.models import APIClient

class HMACAuthMiddleware:
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # if request.path.startswith('/admin/'):
        #     return self.get_response(request)
        if not request.path.startswith('/api/'):
            return self.get_response(request)

        client_id = request.headers.get('X-Client-ID')
        signature = request.headers.get('X-Signature')

        if not client_id or not signature:
            return JsonResponse({'error': 'Missing credentials'}, status=401)

        try:
            client = APIClient.objects.get(client_id=client_id)
        except APIClient.DoesNotExist:
            return JsonResponse({'error': 'Invalid client ID'}, status=401)

        # Signature is an HMAC-SHA256 of the raw request body keyed with the client's secret.
        expected_signature = hmac.new(
            client.secret_key.encode(),
            request.body,
            hashlib.sha256
        ).hexdigest()

        if not hmac.compare_digest(expected_signature, signature):
            return JsonResponse({'error': 'Invalid signature'}, status=401)

        request.api_client = client
        return self.get_response(request)
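A minimal client-side sketch of the signing scheme this middleware expects: the raw request body is signed with HMAC-SHA256 using the client's secret key and sent alongside the client ID. The endpoint URL and credentials are placeholders, and the requests library is assumed to be available on the client.

import hashlib
import hmac
import json

import requests  # assumed client-side dependency

CLIENT_ID = "replace-with-client-id"    # placeholder
SECRET_KEY = "replace-with-secret-key"  # placeholder

body = json.dumps({"claim": "contoh dakwaan"}).encode()
signature = hmac.new(SECRET_KEY.encode(), body, hashlib.sha256).hexdigest()

response = requests.post(
    "http://localhost:8000/api/classification/",  # placeholder path under /api/
    data=body,
    headers={
        "Content-Type": "application/json",
        "X-Client-ID": CLIENT_ID,
        "X-Signature": signature,
    },
)
print(response.status_code)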
ai_api/migrations/0001_initial.py
ADDED
|
@@ -0,0 +1,24 @@
# Generated by Django 4.2.20 on 2025-05-08 00:50

from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='APIClient',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, unique=True)),
                ('client_id', models.CharField(editable=False, max_length=32, unique=True)),
                ('secret_key', models.CharField(editable=False, max_length=64)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
            ],
        ),
    ]
ai_api/migrations/__init__.py
ADDED
|
File without changes
|
ai_api/models.py
ADDED
|
@@ -0,0 +1,18 @@
from django.db import models
import secrets

class APIClient(models.Model):
    name = models.CharField(max_length=100, unique=True)
    client_id = models.CharField(max_length=32, unique=True, editable=False)
    secret_key = models.CharField(max_length=64, editable=False)
    created_at = models.DateTimeField(auto_now_add=True)

    def save(self, *args, **kwargs):
        if not self.client_id:
            self.client_id = secrets.token_hex(16)
        if not self.secret_key:
            self.secret_key = secrets.token_hex(32)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name
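Because save() fills in client_id and secret_key automatically, issuing credentials is just a matter of creating a row; a minimal sketch, e.g. from python manage.py shell (the client name is a placeholder):

from ai_api.models import APIClient

client = APIClient.objects.create(name="demo-client")
print(client.client_id)   # 32 hex characters from secrets.token_hex(16)
print(client.secret_key)  # 64 hex characters from secrets.token_hex(32)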
ai_api/request_serializer.py
ADDED
|
@@ -0,0 +1,30 @@
from rest_framework import serializers

class TranscriptionRequestSerializer(serializers.Serializer):
    url = serializers.URLField(required=False, allow_null=True)
    media = serializers.FileField(required=False, allow_null=True)

    def validate(self, attrs):
        url = attrs.get('url')
        media = attrs.get('media')

        if not url and not media:
            raise serializers.ValidationError("Either 'url' or 'media' must be provided.")

        return attrs

    def validate_media(self, file):
        if file is None:
            return file

        allowed_types = ['audio/', 'video/']
        content_type = getattr(file, 'content_type', '')

        if not any(content_type.startswith(t) for t in allowed_types):
            raise serializers.ValidationError("Only audio or video files are allowed.")

        return file

class ClassificationRequestSerializer(serializers.Serializer):
    claim = serializers.CharField()
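A minimal sketch of how these serializers behave when validating request payloads; the example data is illustrative only:

from ai_api.request_serializer import (
    ClassificationRequestSerializer,
    TranscriptionRequestSerializer,
)

s = TranscriptionRequestSerializer(data={})  # neither url nor media supplied
print(s.is_valid(), s.errors)                # False, with the "Either 'url' or 'media'" error

s = TranscriptionRequestSerializer(data={"url": "https://example.com/audio.mp3"})
print(s.is_valid())                          # True

s = ClassificationRequestSerializer(data={"claim": "contoh dakwaan"})
print(s.is_valid(), s.validated_data)        # True {'claim': 'contoh dakwaan'}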
ai_api/templates/base-copy.html
ADDED
|
@@ -0,0 +1,35 @@
| 1 |
+
<!-- templates/base.html -->
|
| 2 |
+
<!DOCTYPE html>
|
| 3 |
+
<html lang="en">
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>{% block title %}My Django Project{% endblock %}</title>
|
| 8 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 9 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 10 |
+
</head>
|
| 11 |
+
<body>
|
| 12 |
+
<!-- Navbar (optional) -->
|
| 13 |
+
<nav class="navbar navbar-expand-lg navbar-light bg-light ps-2">
|
| 14 |
+
<a class="navbar-brand" href="/">Home</a>
|
| 15 |
+
</nav>
|
| 16 |
+
|
| 17 |
+
<!-- Main content area -->
|
| 18 |
+
<div class="container m-2">
|
| 19 |
+
{% block content %}{% endblock %}
|
| 20 |
+
</div>
|
| 21 |
+
|
| 22 |
+
<!-- Footer (optional) -->
|
| 23 |
+
<footer class="bg-light text-center py-3">
|
| 24 |
+
<p>© 2025 BERNAMA Fact Check</p>
|
| 25 |
+
</footer>
|
| 26 |
+
|
| 27 |
+
<!-- jQuery Library -->
|
| 28 |
+
<script src="https://code.jquery.com/jquery-3.6.4.min.js"
|
| 29 |
+
integrity="sha256-oP6HI9z1XaZNBrJURtCoUT5SUnxFr8s3BzRl+cbzUq8="
|
| 30 |
+
crossorigin="anonymous"></script>
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
{% block scripts %}{% endblock %}
|
| 34 |
+
</body>
|
| 35 |
+
</html>
|
ai_api/templates/base.html
ADDED
|
@@ -0,0 +1,61 @@
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>{% block title %}BERNAMA Fact Check{% endblock %}</title>
|
| 7 |
+
{% load static %}
|
| 8 |
+
|
| 9 |
+
<!-- Bootstrap CSS -->
|
| 10 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 11 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha512-...hash..." crossorigin="anonymous" referrerpolicy="no-referrer" />
|
| 12 |
+
<link rel="stylesheet" href="{% static 'js/DataTables/datatables.min.css' %}">
|
| 13 |
+
|
| 14 |
+
<link rel="icon" href="{% static 'favicon.ico' %}" type="image/x-icon">
|
| 15 |
+
<!-- Optional: Custom dark mode toggle -->
|
| 16 |
+
<style>
|
| 17 |
+
body.dark-mode {
|
| 18 |
+
background-color: #121212;
|
| 19 |
+
color: #f8f9fa;
|
| 20 |
+
}
|
| 21 |
+
body.dark-mode .bg-light {
|
| 22 |
+
background-color: #1f1f1f !important;
|
| 23 |
+
}
|
| 24 |
+
body.dark-mode .text-muted {
|
| 25 |
+
color: #adb5bd !important;
|
| 26 |
+
}
|
| 27 |
+
</style>
|
| 28 |
+
</head>
|
| 29 |
+
<body class="dark-mode">
|
| 30 |
+
|
| 31 |
+
<!-- Hero Section -->
|
| 32 |
+
<section class="py-5 bg-light text-center shadow">
|
| 33 |
+
<div class="container">
|
| 34 |
+
<h1 class="display-5 fw-bold mb-3">AI Feature Testing Bed</h1>
|
| 35 |
+
<p class="lead text-muted mb-4">Experiment with cutting-edge AI modules like Face Recognition and Speech Transcription in one place.</p>
|
| 36 |
+
<a href="/#features" class="btn btn-primary btn-lg">Explore Features</a>
|
| 37 |
+
</div>
|
| 38 |
+
</section>
|
| 39 |
+
|
| 40 |
+
<!-- Main Section -->
|
| 41 |
+
<section class="py-5">
|
| 42 |
+
<div class="container">
|
| 43 |
+
{% block content %}{% endblock %}
|
| 44 |
+
</div>
|
| 45 |
+
</section>
|
| 46 |
+
|
| 47 |
+
<!-- Footer -->
|
| 48 |
+
<footer class="text-center py-4 text-muted">
|
| 49 |
+
© 2025 BERNAMA Fact Check. All rights reserved.
|
| 50 |
+
</footer>
|
| 51 |
+
|
| 52 |
+
<!-- Bootstrap JS Bundle (with Popper) -->
|
| 53 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 54 |
+
|
| 55 |
+
<!-- jQuery -->
|
| 56 |
+
<script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
|
| 57 |
+
<script src="{% static 'js/DataTables/datatables.min.js' %}"></script>
|
| 58 |
+
|
| 59 |
+
{% block scripts %}{% endblock %}
|
| 60 |
+
</body>
|
| 61 |
+
</html>
|
ai_api/templates/classification.html
ADDED
|
@@ -0,0 +1,142 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
|
| 3 |
+
{% block content %}
|
| 4 |
+
<div class="container py-4">
|
| 5 |
+
<h2 class="mb-4 fw-bold text-white">Classification</h2>
|
| 6 |
+
|
| 7 |
+
<form id="classificationForm" method="POST">
|
| 8 |
+
{% csrf_token %}
|
| 9 |
+
{{ form.as_p }}
|
| 10 |
+
<button type="submit" class="btn btn-primary mt-3">
|
| 11 |
+
Submit
|
| 12 |
+
</button>
|
| 13 |
+
</form>
|
| 14 |
+
|
| 15 |
+
<!-- Progress Bar -->
|
| 16 |
+
<div id="progressContainer" class="mt-4" style="display: none;">
|
| 17 |
+
<div class="progress">
|
| 18 |
+
<div id="progressBar" class="progress-bar progress-bar-striped progress-bar-animated" role="progressbar" style="width: 0%"></div>
|
| 19 |
+
</div>
|
| 20 |
+
<p id="progressText" class="text-white mt-2"></p>
|
| 21 |
+
</div>
|
| 22 |
+
|
| 23 |
+
<!-- Results Container -->
|
| 24 |
+
<div id="resultsContainer" style="display: none;">
|
| 25 |
+
<div class="alert alert-secondary text-uppercase small mt-4">
|
| 26 |
+
<p><strong>Category:</strong> <span id="category"></span></p>
|
| 27 |
+
<p><strong>Keywords:</strong> <span id="keywords"></span></p>
|
| 28 |
+
<p><strong>Priority Index:</strong> <span id="priorityScore"></span>/10</p>
|
| 29 |
+
</div>
|
| 30 |
+
|
| 31 |
+
<div class="row g-4 mt-2" id="priorityCards">
|
| 32 |
+
<!-- Cards will be dynamically inserted here -->
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
<div class="row mt-2 table-responsive" id="sentimentTable">
|
| 36 |
+
<!-- Sentiment table will be dynamically inserted here -->
|
| 37 |
+
</div>
|
| 38 |
+
</div>
|
| 39 |
+
</div>
|
| 40 |
+
{% endblock %}
|
| 41 |
+
|
| 42 |
+
{% block scripts %}
|
| 43 |
+
<script>
|
| 44 |
+
$(document).ready(function(){
|
| 45 |
+
let progressInterval;
|
| 46 |
+
|
| 47 |
+
$('#classificationForm').on('submit', function(e) {
|
| 48 |
+
e.preventDefault();
|
| 49 |
+
|
| 50 |
+
// Reset and show progress
|
| 51 |
+
$('#progressContainer').show();
|
| 52 |
+
$('#resultsContainer').hide();
|
| 53 |
+
$('#progressBar').css('width', '0%');
|
| 54 |
+
$('#progressText').text('Starting...');
|
| 55 |
+
|
| 56 |
+
// Clear any existing interval
|
| 57 |
+
if (progressInterval) {
|
| 58 |
+
clearInterval(progressInterval);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
// Get form data
|
| 62 |
+
const formData = new FormData(this);
|
| 63 |
+
const progressKey = Date.now().toString();
|
| 64 |
+
formData.append('progress_key', progressKey);
|
| 65 |
+
|
| 66 |
+
// Start progress checking
|
| 67 |
+
progressInterval = setInterval(() => {
|
| 68 |
+
$.get(`/progress/${progressKey}/`, function(data) {
|
| 69 |
+
$('#progressBar').css('width', `${data.percent}%`);
|
| 70 |
+
$('#progressText').text(`${data.stage}...`);
|
| 71 |
+
|
| 72 |
+
if (data.stage === 'complete') {
|
| 73 |
+
clearInterval(progressInterval);
|
| 74 |
+
}
|
| 75 |
+
});
|
| 76 |
+
}, 1000);
|
| 77 |
+
|
| 78 |
+
// Submit form via AJAX
|
| 79 |
+
$.ajax({
|
| 80 |
+
url: window.location.pathname,
|
| 81 |
+
type: 'POST',
|
| 82 |
+
data: formData,
|
| 83 |
+
processData: false,
|
| 84 |
+
contentType: false,
|
| 85 |
+
success: function(response) {
|
| 86 |
+
clearInterval(progressInterval);
|
| 87 |
+
$('#progressContainer').hide();
|
| 88 |
+
$('#resultsContainer').show();
|
| 89 |
+
|
| 90 |
+
// Update results
|
| 91 |
+
$('#category').text(response.classification);
|
| 92 |
+
$('#keywords').text(response.keywords.join(', '));
|
| 93 |
+
$('#priorityScore').text(response.priority_data.priority_score.toFixed(1));
|
| 94 |
+
|
| 95 |
+
// Update priority cards
|
| 96 |
+
const priorityFlags = response.priority_data.priority_flags;
|
| 97 |
+
const cardData = [
|
| 98 |
+
{ title: 'Does it have fact-check news value?', flag: 'fact_check_value', bg: 'bg-primary' },
|
| 99 |
+
{ title: 'Could it cause confusion?', flag: 'cause_confusion', bg: 'bg-secondary' },
|
| 100 |
+
{ title: 'Could it cause chaos?', flag: 'cause_chaos', bg: 'bg-success' },
|
| 101 |
+
{ title: 'Does it affect government?', flag: 'affects_government', bg: 'bg-danger' },
|
| 102 |
+
{ title: 'Immediate economic impact?', flag: 'economic_impact', bg: 'bg-warning' },
|
| 103 |
+
{ title: 'Have laws been broken/bent?', flag: 'law_related', bg: 'bg-info' },
|
| 104 |
+
{ title: 'Is it in the public interest?', flag: 'public_interest', bg: 'bg-light' },
|
| 105 |
+
{ title: 'Are lives in danger?', flag: 'lives_in_danger', bg: 'bg-dark' },
|
| 106 |
+
{ title: 'Is it already viral?', flag: 'viral', bg: 'bg-warning' },
|
| 107 |
+
{ title: 'Is it urgent or time sensitive?', flag: 'urgent', bg: 'bg-success' }
|
| 108 |
+
];
|
| 109 |
+
|
| 110 |
+
let cardsHtml = '';
|
| 111 |
+
cardData.forEach(card => {
|
| 112 |
+
cardsHtml += `
|
| 113 |
+
<div class="col-12 col-sm-6 col-md-4 col-lg-3">
|
| 114 |
+
<div class="card text-white ${card.bg} h-100 shadow">
|
| 115 |
+
<div class="card-body">
|
| 116 |
+
<h5 class="card-title" style="height: 50px;">${card.title}</h5>
|
| 117 |
+
<p class="card-text fs-1">${priorityFlags[card.flag] ? 'Yes' : 'No'}</p>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
</div>
|
| 121 |
+
`;
|
| 122 |
+
});
|
| 123 |
+
$('#priorityCards').html(cardsHtml);
|
| 124 |
+
|
| 125 |
+
// Update sentiment table if available
|
| 126 |
+
if (response.sentiment_data && response.sentiment_data.table_html) {
|
| 127 |
+
$('#sentimentTable').html(response.sentiment_data.table_html);
|
| 128 |
+
$('#sentimentTable table').DataTable({
|
| 129 |
+
responsive: true
|
| 130 |
+
});
|
| 131 |
+
}
|
| 132 |
+
},
|
| 133 |
+
error: function(xhr) {
|
| 134 |
+
clearInterval(progressInterval);
|
| 135 |
+
$('#progressContainer').hide();
|
| 136 |
+
alert('Error: ' + (xhr.responseJSON?.error || 'An error occurred'));
|
| 137 |
+
}
|
| 138 |
+
});
|
| 139 |
+
});
|
| 140 |
+
});
|
| 141 |
+
</script>
|
| 142 |
+
{% endblock %}
|
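The polling loop above expects /progress/<key>/ to return {'stage': ..., 'percent': ...}, matching the dictionaries the classification view writes to the cache. The real endpoint is views.check_progress in ai_api/views.py; a minimal sketch of that contract (the fallback value is an assumption) looks like:

from django.core.cache import cache
from django.http import JsonResponse

def check_progress(request, key):
    # Return whatever progress dict the long-running view last cached for this key.
    return JsonResponse(cache.get(key, {"stage": "pending", "percent": 0}))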
ai_api/templates/home-copy.html
ADDED
|
@@ -0,0 +1,38 @@
| 1 |
+
<!-- templates/home.html -->
|
| 2 |
+
{% extends 'base.html' %}
|
| 3 |
+
|
| 4 |
+
{% block title %}Welcome to My Homepage{% endblock %}
|
| 5 |
+
|
| 6 |
+
{% block content %}
|
| 7 |
+
<h1>BERNAMA Fact Check Test Bed!</h1>
|
| 8 |
+
<div class="row col-12 mb-2">
|
| 9 |
+
<div class="card col-3 m-1">
|
| 10 |
+
<div class="card-body">
|
| 11 |
+
<h5 class="card-title">Claim Classification</h5>
|
| 12 |
+
<p class="card-text">Input a claim and submit for AI to classify the statement.</p>
|
| 13 |
+
<a href="/classification" class="btn btn-primary">Test Now</a>
|
| 14 |
+
</div>
|
| 15 |
+
</div>
|
| 16 |
+
<div class="card col-3 m-1">
|
| 17 |
+
<div class="card-body">
|
| 18 |
+
<h5 class="card-title">Image Profiling</h5>
|
| 19 |
+
<p class="card-text">Upload an image for AI to analyze.</p>
|
| 20 |
+
<a href="/image_profiling" class="btn btn-primary">Test Now</a>
|
| 21 |
+
</div>
|
| 22 |
+
</div>
|
| 23 |
+
<div class="card col-3 m-1">
|
| 24 |
+
<div class="card-body">
|
| 25 |
+
<h5 class="card-title">Register New Face</h5>
|
| 26 |
+
<p class="card-text">Insert a person name for AI to learn face recongnition.</p>
|
| 27 |
+
<a href="/register_face" class="btn btn-primary">Test Now</a>
|
| 28 |
+
</div>
|
| 29 |
+
</div>
|
| 30 |
+
<div class="card col-3 m-1">
|
| 31 |
+
<div class="card-body">
|
| 32 |
+
<h5 class="card-title">Transcription</h5>
|
| 33 |
+
<p class="card-text">Audio/Video to transcription (text)</p>
|
| 34 |
+
<a href="/transcription" class="btn btn-primary">Test Now</a>
|
| 35 |
+
</div>
|
| 36 |
+
</div>
|
| 37 |
+
</div>
|
| 38 |
+
{% endblock %}
|
ai_api/templates/home.html
ADDED
|
@@ -0,0 +1,60 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
|
| 3 |
+
{% block title %}BERNAMA Fact Check{% endblock %}
|
| 4 |
+
|
| 5 |
+
{% block content %}
|
| 6 |
+
|
| 7 |
+
<!-- Features Section -->
|
| 8 |
+
<section id="features" class="py-5">
|
| 9 |
+
<div class="container">
|
| 10 |
+
<h2 class="text-center fw-bold mb-5 display-6">Core AI Modules</h2>
|
| 11 |
+
<div class="row g-4">
|
| 12 |
+
<!-- Feature Card -->
|
| 13 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 14 |
+
<a href="/classification" class="text-decoration-none">
|
| 15 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 16 |
+
<div class="card-body">
|
| 17 |
+
<h5 class="card-title">Claim Classification</h5>
|
| 18 |
+
<p class="card-text text-muted">Input a claim and submit for AI to classify the statement.</p>
|
| 19 |
+
</div>
|
| 20 |
+
</div>
|
| 21 |
+
</a>
|
| 22 |
+
</div>
|
| 23 |
+
|
| 24 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 25 |
+
<a href="/transcription" class="text-decoration-none">
|
| 26 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 27 |
+
<div class="card-body">
|
| 28 |
+
<h5 class="card-title">Transcription</h5>
|
| 29 |
+
<p class="card-text text-muted">Convert spoken words into text using advanced speech-to-text models.</p>
|
| 30 |
+
</div>
|
| 31 |
+
</div>
|
| 32 |
+
</a>
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 36 |
+
<a href="/image_profiling" class="text-decoration-none">
|
| 37 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 38 |
+
<div class="card-body">
|
| 39 |
+
<h5 class="card-title">Image Processing</h5>
|
| 40 |
+
<p class="card-text text-muted">Image profiling; face detection, metadata, captioning etc.</p>
|
| 41 |
+
</div>
|
| 42 |
+
</div>
|
| 43 |
+
</a>
|
| 44 |
+
</div>
|
| 45 |
+
|
| 46 |
+
<div class="col-12 col-md-6 col-lg-4">
|
| 47 |
+
<a href="/register_face" class="text-decoration-none">
|
| 48 |
+
<div class="card h-100 shadow-sm hover-shadow transition">
|
| 49 |
+
<div class="card-body">
|
| 50 |
+
<h5 class="card-title">Face Register</h5>
|
| 51 |
+
<p class="card-text text-muted">Register new face.</p>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
</a>
|
| 55 |
+
</div>
|
| 56 |
+
</div>
|
| 57 |
+
</div>
|
| 58 |
+
</section>
|
| 59 |
+
|
| 60 |
+
{% endblock %}
|
ai_api/templates/image_profiling.html
ADDED
|
@@ -0,0 +1,122 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Image Processing</h2>
|
| 4 |
+
|
| 5 |
+
<form class="mb-4" method="POST" enctype="multipart/form-data">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
{{ form.as_p }}
|
| 8 |
+
<button type="submit" class="btn btn-primary">Upload Image</button>
|
| 9 |
+
</form>
|
| 10 |
+
|
| 11 |
+
{% if proccessed %}
|
| 12 |
+
<div class="mt-4">
|
| 13 |
+
<div class="nav nav-tabs" id="myTab" role="tablist">
|
| 14 |
+
<a class="nav-item nav-link active" id="home-tab" data-bs-toggle="tab" href="#home-tab-pane">Uploaded Image</a>
|
| 15 |
+
<a class="nav-item nav-link" id="profile-tab" data-bs-toggle="tab" href="#profile-tab-pane">Face Detects</a>
|
| 16 |
+
<a class="nav-item nav-link" id="contact-tab" data-bs-toggle="tab" href="#contact-tab-pane">OCR Texts</a>
|
| 17 |
+
<a class="nav-item nav-link" id="disabled-tab" data-bs-toggle="tab" href="#disabled-tab-pane">Metadata</a>
|
| 18 |
+
<a class="nav-item nav-link" id="augmentive-tab" data-bs-toggle="tab" href="#augmentive-tab-pane">Augmentive</a>
|
| 19 |
+
</div>
|
| 20 |
+
|
| 21 |
+
<div class="tab-content mt-4">
|
| 22 |
+
<div id="home-tab-pane" class="tab-pane fade show active">
|
| 23 |
+
<img class="img-fluid mx-auto rounded" src="{{ uploaded_base64 }}" alt="Uploaded Image">
|
| 24 |
+
</div>
|
| 25 |
+
|
| 26 |
+
<div id="profile-tab-pane" class="tab-pane fade">
|
| 27 |
+
{% if cropped_faces %}
|
| 28 |
+
<div class="row g-3">
|
| 29 |
+
<div class="col-md-5">
|
| 30 |
+
<h3 class="mt-4 fw-bold">Detected Faces</h3>
|
| 31 |
+
<img class="img-fluid rounded" src="{{ image_with_labels }}" alt="Detected Faces">
|
| 32 |
+
</div>
|
| 33 |
+
|
| 34 |
+
<div class="col-md-7">
|
| 35 |
+
<h3 class="mt-4 fw-bold">Cropped Faces</h3>
|
| 36 |
+
<div class="d-flex flex-wrap gap-4">
|
| 37 |
+
{% for face, face_name, distance, fdescription in cropped_faces %}
|
| 38 |
+
<div class="text-center text-xs" style="width: 80px;">
|
| 39 |
+
<img src="{{ face }}" alt="Cropped Face" class="img-thumbnail img-fluid mb-1">
|
| 40 |
+
<div style="font-size:10px">
|
| 41 |
+
<strong>{{ face_name }}</strong><br>{{ fdescription }}
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
{% endfor %}
|
| 45 |
+
</div>
|
| 46 |
+
</div>
|
| 47 |
+
</div>
|
| 48 |
+
{% endif %}
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
<div id="contact-tab-pane" class="tab-pane fade">
|
| 52 |
+
{% if texts %}
|
| 53 |
+
<div class="d-flex flex-wrap gap-2">
|
| 54 |
+
{% for text in texts %}
|
| 55 |
+
<span class="badge bg-success text-white">{{ text }}</span>
|
| 56 |
+
{% endfor %}
|
| 57 |
+
</div>
|
| 58 |
+
{% endif %}
|
| 59 |
+
</div>
|
| 60 |
+
|
| 61 |
+
<div id="disabled-tab-pane" class="tab-pane fade">
|
| 62 |
+
<div class="d-flex flex-wrap gap-4">
|
| 63 |
+
{% if metadata %}
|
| 64 |
+
<div class="w-100">
|
| 65 |
+
<table class="table table-sm table-striped">
|
| 66 |
+
<thead class="table-light">
|
| 67 |
+
<tr>
|
| 68 |
+
<th>IPTC Field</th>
|
| 69 |
+
<th>Value</th>
|
| 70 |
+
</tr>
|
| 71 |
+
</thead>
|
| 72 |
+
<tbody>
|
| 73 |
+
{% for tag, value in metadata.items %}
|
| 74 |
+
<tr>
|
| 75 |
+
<td>{{ tag }}</td>
|
| 76 |
+
<td>{{ value }}</td>
|
| 77 |
+
</tr>
|
| 78 |
+
{% endfor %}
|
| 79 |
+
</tbody>
|
| 80 |
+
</table>
|
| 81 |
+
</div>
|
| 82 |
+
{% endif %}
|
| 83 |
+
|
| 84 |
+
{% if exifs %}
|
| 85 |
+
<div class="w-100">
|
| 86 |
+
<table class="table table-sm table-striped">
|
| 87 |
+
<thead class="table-light">
|
| 88 |
+
<tr>
|
| 89 |
+
<th>EXIF Field</th>
|
| 90 |
+
<th>Value</th>
|
| 91 |
+
</tr>
|
| 92 |
+
</thead>
|
| 93 |
+
<tbody>
|
| 94 |
+
{% for tag, value in exifs.items %}
|
| 95 |
+
<tr>
|
| 96 |
+
<td>{{ tag }}</td>
|
| 97 |
+
<td>{{ value }}</td>
|
| 98 |
+
</tr>
|
| 99 |
+
{% endfor %}
|
| 100 |
+
</tbody>
|
| 101 |
+
</table>
|
| 102 |
+
</div>
|
| 103 |
+
{% endif %}
|
| 104 |
+
</div>
|
| 105 |
+
</div>
|
| 106 |
+
|
| 107 |
+
<div id="augmentive-tab-pane" class="tab-pane fade">
|
| 108 |
+
{% if description %}
|
| 109 |
+
<h3 class="fw-semibold">{{ description }}</h3>
|
| 110 |
+
{% endif %}
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<div id="reverse-tab-pane" class="tab-pane fade">
|
| 114 |
+
{% if reverse_images %}
|
| 115 |
+
{{ reverse_images }}
|
| 116 |
+
{% endif %}
|
| 117 |
+
</div>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
|
| 121 |
+
{% endif %}
|
| 122 |
+
{% endblock %}
|
ai_api/templates/register_face.html
ADDED
|
@@ -0,0 +1,42 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Face Register</h2>
|
| 4 |
+
|
| 5 |
+
<form method="POST" enctype="multipart/form-data" class="mb-4">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
<div class="row g-4">
|
| 8 |
+
<div class="col-md-6">
|
| 9 |
+
<label for="{{ form.person.id_for_label }}" class="form-label">
|
| 10 |
+
{{ form.person.label }}
|
| 11 |
+
</label>
|
| 12 |
+
{{ form.person }}
|
| 13 |
+
</div>
|
| 14 |
+
|
| 15 |
+
<div class="col-md-6">
|
| 16 |
+
<label for="{{ form.keywords.id_for_label }}" class="form-label">
|
| 17 |
+
{{ form.keywords.label }}
|
| 18 |
+
</label>
|
| 19 |
+
{{ form.keywords }}
|
| 20 |
+
</div>
|
| 21 |
+
</div>
|
| 22 |
+
|
| 23 |
+
<div class="row g-4">
|
| 24 |
+
<div class="col-md-6">
|
| 25 |
+
<label for="{{ form.images.id_for_label }}" class="form-label">
|
| 26 |
+
{{ form.images.label }}
|
| 27 |
+
</label>
|
| 28 |
+
{{ form.images }}
|
| 29 |
+
</div>
|
| 30 |
+
</div>
|
| 31 |
+
|
| 32 |
+
<button type="submit" class="btn btn-primary mt-2">
|
| 33 |
+
Register
|
| 34 |
+
</button>
|
| 35 |
+
</form>
|
| 36 |
+
|
| 37 |
+
{% if result %}
|
| 38 |
+
<div class="mt-4 bg-light p-4 rounded shadow-sm">
|
| 39 |
+
<p class="mb-0">{{ result }}</p>
|
| 40 |
+
</div>
|
| 41 |
+
{% endif %}
|
| 42 |
+
{% endblock %}
|
ai_api/templates/transcription.html
ADDED
|
@@ -0,0 +1,159 @@
| 1 |
+
{% extends 'base.html' %}
|
| 2 |
+
{% block content %}
|
| 3 |
+
<h2 class="mb-4 fw-bold text-white">Transcription</h2>
|
| 4 |
+
|
| 5 |
+
<form method="post" class="mb-3" id="yt-form" enctype="multipart/form-data">
|
| 6 |
+
{% csrf_token %}
|
| 7 |
+
{{ form.as_p }}
|
| 8 |
+
<input type="hidden" value="{{progress_key}}" name="progress_key">
|
| 9 |
+
<button type="submit" class="btn btn-primary" id="btnSubmit">
|
| 10 |
+
Transcribe
|
| 11 |
+
</button>
|
| 12 |
+
</form>
|
| 13 |
+
|
| 14 |
+
<!-- Progress Bar -->
|
| 15 |
+
<div class="progress mb-4 d-none" id="progress-container">
|
| 16 |
+
<div class="progress-bar progress-bar-striped progress-bar-animated"
|
| 17 |
+
role="progressbar"
|
| 18 |
+
aria-valuenow="0"
|
| 19 |
+
aria-valuemin="0"
|
| 20 |
+
aria-valuemax="100"
|
| 21 |
+
style="width: 0%">
|
| 22 |
+
</div>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
<!-- Transcription Result -->
|
| 26 |
+
<div id="transcription" class="d-none">
|
| 27 |
+
<div class="bg-light p-4 rounded shadow-sm">
|
| 28 |
+
<div class="container"></div>
|
| 29 |
+
</div>
|
| 30 |
+
</div>
|
| 31 |
+
{% endblock %}
|
| 32 |
+
|
| 33 |
+
{% block scripts %}
|
| 34 |
+
<script src="https://rawcdn.githack.com/mozilla/vtt.js/master/dist/vtt.min.js"></script>
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
<script>
|
| 39 |
+
function copyToClipboard(selector) {
|
| 40 |
+
const text = $(selector).text(); // Get innerText
|
| 41 |
+
navigator.clipboard.writeText(text)
|
| 42 |
+
.then(() => {
|
| 43 |
+
// console.log('Copied to clipboard:', text);
|
| 44 |
+
})
|
| 45 |
+
.catch(err => {
|
| 46 |
+
console.error('Failed to copy:', err);
|
| 47 |
+
});
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
function ucfirst(str) {
|
| 51 |
+
if (!str) return '';
|
| 52 |
+
return str.charAt(0).toUpperCase() + str.slice(1);
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
$(document).ready(function () {
|
| 56 |
+
$('#yt-form').on('submit', function (e) {
|
| 57 |
+
e.preventDefault();
|
| 58 |
+
|
| 59 |
+
$('#btnSubmit').text('Downloading...');
|
| 60 |
+
$('#btnSubmit').prop('disabled', true);
|
| 61 |
+
|
| 62 |
+
$('#progress-container').removeClass('d-none');
|
| 63 |
+
const $bar = $('.progress-bar');
|
| 64 |
+
const key = '{{ progress_key }}';
|
| 65 |
+
var formData = new FormData(this);
|
| 66 |
+
|
| 67 |
+
$.ajax({
|
| 68 |
+
url: '.',
|
| 69 |
+
type: 'POST',
|
| 70 |
+
data: formData,
|
| 71 |
+
processData: false,
|
| 72 |
+
contentType: false,
|
| 73 |
+
success: function (response) {
|
| 74 |
+
if (response.segments) {
|
| 75 |
+
$('#transcription').removeClass('d-none');
|
| 76 |
+
|
| 77 |
+
$('#progress-container').removeClass('d-none');
|
| 78 |
+
$('#transcription .container').empty(); // Clear previous content
|
| 79 |
+
|
| 80 |
+
// Insert audio HTML
|
| 81 |
+
$('#transcription .container').append(response.audio_file);
|
| 82 |
+
|
| 83 |
+
// Add subtitle box
|
| 84 |
+
const subtitleBox = $('<div id="subtitleBox" style="padding:1em;background:#222;color:white;margin-top:10px;min-height:40px;"></div>')
|
| 85 |
+
.text("Play the audio");
|
| 86 |
+
$('#transcription .container').append(subtitleBox);
|
| 87 |
+
|
| 88 |
+
// Get the audio file URL from the HTML string
|
| 89 |
+
const audioSrcMatch = response.audio_file.match(/src="([^"]+)"/);
|
| 90 |
+
if (!audioSrcMatch) return;
|
| 91 |
+
|
| 92 |
+
const audioUrl = audioSrcMatch[1]; // /media/uploads/file.wav
|
| 93 |
+
const vttUrl = audioUrl.replace('/uploads/', '/vtt/').replace(/\.\w+$/, '.vtt'); // change extension to .vtt
|
| 94 |
+
|
| 95 |
+
// Load and parse the VTT file using vtt.js
|
| 96 |
+
const audio = document.querySelector('#transcription audio');
|
| 97 |
+
let cues = [];
|
| 98 |
+
|
| 99 |
+
fetch(vttUrl)
|
| 100 |
+
.then(res => res.text())
|
| 101 |
+
.then(vttData => {
|
| 102 |
+
const parser = new WebVTT.Parser(window, WebVTT.StringDecoder());
|
| 103 |
+
parser.oncue = function (cue) {
|
| 104 |
+
cues.push(cue);
|
| 105 |
+
};
|
| 106 |
+
parser.parse(vttData);
|
| 107 |
+
parser.flush();
|
| 108 |
+
});
|
| 109 |
+
|
| 110 |
+
audio.addEventListener('timeupdate', () => {
|
| 111 |
+
const currentTime = audio.currentTime;
|
| 112 |
+
const activeCue = cues.find(cue => currentTime >= cue.startTime && currentTime <= cue.endTime);
|
| 113 |
+
document.getElementById('subtitleBox').textContent = activeCue ? activeCue.text : '';
|
| 114 |
+
});
|
| 115 |
+
|
| 116 |
+
$('<div class="accordion">\
|
| 117 |
+
<div class="accordion-item">\
|
| 118 |
+
<h2 class="accordion-header" id="headingOne">\
|
| 119 |
+
<button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">\
|
| 120 |
+
Full Transcription \
|
| 121 |
+
</button>\
|
| 122 |
+
</h2>\
|
| 123 |
+
<div id="collapseOne" class="accordion-collapse collapse show" aria-labelledby="headingOne" data-bs-parent="#accordionExample">\
|
| 124 |
+
<div class="accordion-body">\
|
| 125 |
+
<div class="float-end"> <a href="'+vttUrl+'" download class="btn btn-sm btn-info me-1" title="Download"> <i class="fa fa-download"></i></a><button class="btn btn-sm me-1 btn-info" title="Copy" onClick="copyToClipboard(\'#segments\')"> <i class="fa fa-clipboard"></i></button></div>\
|
| 126 |
+
<div class="mt-3" id="segments"></div>\
|
| 127 |
+
</div>\
|
| 128 |
+
</div>\
|
| 129 |
+
</div>\
|
| 130 |
+
</div>').appendTo('#transcription .container');
|
| 131 |
+
|
| 132 |
+
$.each(response.segments, function(index, segment) {
|
| 133 |
+
var pElement = $('<pre></pre>').text(segment.text);
|
| 134 |
+
$('#segments').append(pElement);
|
| 135 |
+
});
|
| 136 |
+
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
});
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
const interval = setInterval(function () {
|
| 143 |
+
$.getJSON(`/progress/${key}/`, function (data) {
|
| 144 |
+
$bar.css('width', data.percent + '%');
|
| 145 |
+
$bar.attr('aria-valuenow', data.percent);
|
| 146 |
+
// $bar.html(data.percent + '%');
|
| 147 |
+
$('#btnSubmit').text(ucfirst(data.stage) + '...');
|
| 148 |
+
|
| 149 |
+
if (data.stage === 'done') {
|
| 150 |
+
$('#btnSubmit').prop('disabled', false).text('Transcribe');
|
| 151 |
+
clearInterval(interval);
|
| 152 |
+
$('#progress-container').addClass('d-none');
|
| 153 |
+
}
|
| 154 |
+
});
|
| 155 |
+
}, 1000);
|
| 156 |
+
});
|
| 157 |
+
});
|
| 158 |
+
</script>
|
| 159 |
+
{% endblock %}
|
ai_api/tests.py
ADDED
|
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
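The test module is still a stub; a minimal sketch of a test that could live here, covering the credential generation in ai_api/models.py (class and method names are illustrative):

from django.test import TestCase

from ai_api.models import APIClient


class APIClientCredentialTests(TestCase):
    def test_credentials_generated_on_save(self):
        client = APIClient.objects.create(name="test-client")
        self.assertEqual(len(client.client_id), 32)   # secrets.token_hex(16)
        self.assertEqual(len(client.secret_key), 64)  # secrets.token_hex(32)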
ai_api/urls.py
ADDED
|
@@ -0,0 +1,12 @@
from django.urls import path
from . import views


urlpatterns = [
    path('', views.home, name='home'),
    path('classification/', views.classification, name='classification'),
    path('image_profiling/', views.image_profiling, name='image_profiling'),
    path('register_face/', views.register_face, name='register_face'),
    path('transcription/', views.transcription, name='transcription'),
    path('progress/<str:key>/', views.check_progress, name='check_progress'),
]
ai_api/views.py
ADDED
|
@@ -0,0 +1,799 @@
| 1 |
+
from django.shortcuts import render
|
| 2 |
+
from django.http import JsonResponse
|
| 3 |
+
from .forms import ImageUploadForm, ClassificationForm, RegisterFaceForm,TranscribeForm, YouTubeURLForm
|
| 4 |
+
import shutil
|
| 5 |
+
from django.conf import settings
|
| 6 |
+
import torch
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
from PIL import Image as PILImage
|
| 10 |
+
import io
|
| 11 |
+
import tempfile
|
| 12 |
+
from django.core.cache import cache
|
| 13 |
+
import numpy as numpy_lib
|
| 14 |
+
import pickle
|
| 15 |
+
from deepface import DeepFace
|
| 16 |
+
import cv2
|
| 17 |
+
import base64
|
| 18 |
+
from io import BytesIO
|
| 19 |
+
from . import globals
|
| 20 |
+
import tempfile
|
| 21 |
+
import mimetypes
|
| 22 |
+
import subprocess
|
| 23 |
+
import logging
|
| 24 |
+
import uuid
|
| 25 |
+
import yt_dlp
|
| 26 |
+
import time
|
| 27 |
+
import re
|
| 28 |
+
from pydub import AudioSegment
|
| 29 |
+
import pandas as pd
|
| 30 |
+
import csv
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Setup logging for error handling
|
| 34 |
+
logger = logging.getLogger(__name__)
|
| 35 |
+
|
| 36 |
+
# from ai_api.library.devlab_image import DevLabImage
|
| 37 |
+
|
| 38 |
+
# devlab_image = DevLabImage()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
model = globals.model
|
| 42 |
+
tokenizer = globals.tokenizer
|
| 43 |
+
devlab_image = globals.devlab_image
|
| 44 |
+
|
| 45 |
+
with open(f"{globals.save_path}/label_map.json", "r") as f:
|
| 46 |
+
label_map = json.load(f)
|
| 47 |
+
|
| 48 |
+
index_to_label = {v: k for k, v in label_map.items()}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Create your views here.
|
| 52 |
+
def home(request):
|
| 53 |
+
return render(request, 'home.html')
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def classification(request):
|
| 57 |
+
from .library import simple_keyword_extraction, apify_scraper, priority_indexer, websearch, lowyat_crawler, sentiment_analyzer
|
| 58 |
+
|
| 59 |
+
if request.method == 'POST':
|
| 60 |
+
progress_key = request.POST.get("progress_key", str(uuid.uuid4()))
|
| 61 |
+
cache.set(progress_key, {'stage': 'starting', 'percent': 0})
|
| 62 |
+
|
| 63 |
+
text = request.POST.get("claim", "")
|
| 64 |
+
if not text:
|
| 65 |
+
return JsonResponse({"error": "No text provided"}, status=400)
|
| 66 |
+
|
| 67 |
+
claim_id = str(uuid.uuid4())[:8]
|
| 68 |
+
|
| 69 |
+
try:
|
| 70 |
+
# Step 1: Classification
|
| 71 |
+
cache.set(progress_key, {'stage': 'classifying', 'percent': 10})
|
| 72 |
+
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
|
| 73 |
+
with torch.no_grad():
|
| 74 |
+
outputs = model(**inputs)
|
| 75 |
+
prediction = torch.argmax(outputs.logits, dim=-1).item()
|
| 76 |
+
classification_result = index_to_label.get(prediction, "Unknown")
|
| 77 |
+
|
| 78 |
+
# Step 2: Keyword Extraction
|
| 79 |
+
cache.set(progress_key, {'stage': 'extracting_keywords', 'percent': 20})
|
| 80 |
+
keywords = simple_keyword_extraction.extract_keywords(text)
|
| 81 |
+
|
| 82 |
+
# Step 3: Setup paths
|
| 83 |
+
output_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'output')
|
| 84 |
+
report_path = os.path.join(settings.BASE_DIR, 'ai_api', 'library', 'reports')
|
| 85 |
+
raw_data_path = os.path.join(output_path, f'{claim_id}.csv')
|
| 86 |
+
|
| 87 |
+
# Step 4: Run TikTok scraper
|
| 88 |
+
cache.set(progress_key, {'stage': 'scraping_tiktok', 'percent': 30})
|
| 89 |
+
apify_scraper.run(
|
| 90 |
+
keywords,
|
| 91 |
+
output_path=raw_data_path,
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Step 5: Run web search
|
| 95 |
+
cache.set(progress_key, {'stage': 'searching_web', 'percent': 50})
|
| 96 |
+
web_search_results = websearch.run(
|
| 97 |
+
keywords,
|
| 98 |
+
output_path=os.path.join(output_path, f"{claim_id}_web.json"),
|
| 99 |
+
full_claim=text
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Step 6: Run Lowyat forum crawler
|
| 103 |
+
cache.set(progress_key, {'stage': 'crawling_forum', 'percent': 60})
|
| 104 |
+
lowyat_path = os.path.join(output_path, f"{claim_id}_lowyat.csv")
|
| 105 |
+
lowyat_sections = ["Kopitiam", "SeriousKopitiam"]
|
| 106 |
+
lowyat_results = lowyat_crawler.run(
|
| 107 |
+
keywords,
|
| 108 |
+
sections=lowyat_sections,
|
| 109 |
+
output_path=lowyat_path,
|
| 110 |
+
full_claim=text
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Step 7: Combine datasets
|
| 114 |
+
cache.set(progress_key, {'stage': 'combining_data', 'percent': 70})
|
| 115 |
+
if os.path.exists(lowyat_path):
|
| 116 |
+
lowyat_df = pd.read_csv(lowyat_path)
|
| 117 |
+
if os.path.exists(raw_data_path):
|
| 118 |
+
main_df = pd.read_csv(raw_data_path)
|
| 119 |
+
combined_df = pd.concat([main_df, lowyat_df], ignore_index=True)
|
| 120 |
+
combined_df.to_csv(raw_data_path, index=False)
|
| 121 |
+
else:
|
| 122 |
+
lowyat_df.to_csv(raw_data_path, index=False)
|
| 123 |
+
|
| 124 |
+
# Step 8: Run sentiment analysis
|
| 125 |
+
cache.set(progress_key, {'stage': 'analyzing_sentiment', 'percent': 80})
|
| 126 |
+
sentiment_csv = os.path.join(output_path, f"{claim_id}_sentiment.csv")
|
| 127 |
+
sentiment_data = {}
|
| 128 |
+
|
| 129 |
+
if os.path.exists(raw_data_path):
|
| 130 |
+
sentiment_analyzer.run(raw_data_path, sentiment_csv)
|
| 131 |
+
|
| 132 |
+
if os.path.exists(sentiment_csv):
|
| 133 |
+
sentiment_df = pd.read_csv(sentiment_csv)
|
| 134 |
+
sentiment_counts = sentiment_df['sentiment'].value_counts().to_dict()
|
| 135 |
+
sentiment_map = {0: "neutral", 1: "positive", 2: "negative"}
|
| 136 |
+
text_counts = {sentiment_map.get(k, k): v for k, v in sentiment_counts.items()}
|
| 137 |
+
sentiment_data = {
|
| 138 |
+
'counts': text_counts,
|
| 139 |
+
'table_html': csv_to_html_table(sentiment_csv)
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
# Step 9: Run priority indexing
|
| 143 |
+
cache.set(progress_key, {'stage': 'indexing_priority', 'percent': 90})
|
| 144 |
+
priority_json = os.path.join(report_path, f"{claim_id}_priority.json")
|
| 145 |
+
priority_data = {}
verdict = "UNVERIFIED"
|
| 146 |
+
|
| 147 |
+
if os.path.exists(sentiment_csv):
|
| 148 |
+
priority_indexer.run(
|
| 149 |
+
claim=text,
|
| 150 |
+
claim_id=claim_id,
|
| 151 |
+
keywords=keywords,
|
| 152 |
+
sentiment_csv=sentiment_csv,
|
| 153 |
+
output_path=priority_json
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if os.path.exists(priority_json):
|
| 157 |
+
with open(priority_json, 'r') as f:
|
| 158 |
+
priority_data = json.load(f)
|
| 159 |
+
verdict = determine_verdict(priority_data)
|
| 160 |
+
|
| 161 |
+
# Step 10: Complete
|
| 162 |
+
cache.set(progress_key, {'stage': 'complete', 'percent': 100})
|
| 163 |
+
|
| 164 |
+
return JsonResponse({
|
| 165 |
+
'classification': classification_result,
|
| 166 |
+
'keywords': keywords,
|
| 167 |
+
'sentiment_data': sentiment_data,
|
| 168 |
+
'priority_data': priority_data,
|
| 169 |
+
'verdict': verdict,
|
| 170 |
+
'progress_key': progress_key
|
| 171 |
+
})
|
| 172 |
+
|
| 173 |
+
except Exception as e:
|
| 174 |
+
logger.error(f"Error in classification: {str(e)}")
|
| 175 |
+
return JsonResponse({
|
| 176 |
+
'error': str(e),
|
| 177 |
+
'progress_key': progress_key
|
| 178 |
+
}, status=500)
|
| 179 |
+
|
| 180 |
+
else:
|
| 181 |
+
form = ClassificationForm()
|
| 182 |
+
return render(request, 'classification.html', {
|
| 183 |
+
'form': form,
|
| 184 |
+
'result': {}
|
| 185 |
+
})
|
| 186 |
+
|
| 187 |
+
def determine_verdict(priority_data):
|
| 188 |
+
"""Determine verdict based on priority data"""
|
| 189 |
+
# Extract priority flags from the data
|
| 190 |
+
if isinstance(priority_data, dict):
|
| 191 |
+
if "priority_flags" in priority_data:
|
| 192 |
+
priority_flags = priority_data["priority_flags"]
|
| 193 |
+
else:
|
| 194 |
+
# Assume the dictionary itself contains the flags
|
| 195 |
+
priority_flags = priority_data
|
| 196 |
+
else:
|
| 197 |
+
return "UNVERIFIED"
|
| 198 |
+
|
| 199 |
+
# Get sentiment counts if available
|
| 200 |
+
sentiment_counts = {}
|
| 201 |
+
if "sentiment_counts" in priority_data:
|
| 202 |
+
sentiment_counts = priority_data["sentiment_counts"]
|
| 203 |
+
# Convert keys to strings if they're not already
|
| 204 |
+
if any(not isinstance(k, str) for k in sentiment_counts.keys()):
|
| 205 |
+
sentiment_counts = {str(k): v for k, v in sentiment_counts.items()}
|
| 206 |
+
|
| 207 |
+
# Get priority score if available
|
| 208 |
+
priority_score = priority_data.get("priority_score", sum(priority_flags.values()))
|
| 209 |
+
|
| 210 |
+
# Get claim and keywords
|
| 211 |
+
claim = priority_data.get("claim", "").lower()
|
| 212 |
+
keywords = priority_data.get("keywords", [])
|
| 213 |
+
keywords_lower = [k.lower() for k in keywords]
|
| 214 |
+
|
| 215 |
+
# Check for specific claim patterns
|
| 216 |
+
is_azan_claim = any(word in claim for word in ["azan", "larang", "masjid", "pembesar suara"])
|
| 217 |
+
is_religious_claim = any(word in claim for word in ["islam", "agama", "masjid", "surau", "sembahyang", "solat", "zakat"])
|
| 218 |
+
|
| 219 |
+
# Check for economic impact
|
| 220 |
+
economic_related = priority_flags.get("economic_impact", 0) == 1
|
| 221 |
+
|
| 222 |
+
# Check for government involvement
|
| 223 |
+
government_related = priority_flags.get("affects_government", 0) == 1
|
| 224 |
+
|
| 225 |
+
# Check for law-related content
|
| 226 |
+
law_related = priority_flags.get("law_related", 0) == 1
|
| 227 |
+
|
| 228 |
+
# Check for confusion potential
|
| 229 |
+
causes_confusion = priority_flags.get("cause_confusion", 0) == 1
|
| 230 |
+
|
| 231 |
+
# Check for negative sentiment dominance
|
| 232 |
+
negative_dominant = False
|
| 233 |
+
if sentiment_counts:
|
| 234 |
+
pos = int(sentiment_counts.get("positive", sentiment_counts.get("1", 0)))
|
| 235 |
+
neg = int(sentiment_counts.get("negative", sentiment_counts.get("2", 0)))
|
| 236 |
+
neu = int(sentiment_counts.get("neutral", sentiment_counts.get("0", 0)))
|
| 237 |
+
negative_dominant = neg > pos and neg > neu
|
| 238 |
+
|
| 239 |
+
# Special case for azan claim (like the example provided)
|
| 240 |
+
if is_azan_claim and is_religious_claim and "larangan" in claim:
|
| 241 |
+
return "FALSE" # Claim about banning azan is false
|
| 242 |
+
|
| 243 |
+
# Determine verdict based on multiple factors
|
| 244 |
+
if priority_score >= 7.0 and negative_dominant and (government_related or law_related):
|
| 245 |
+
return "FALSE"
|
| 246 |
+
elif priority_score >= 5.0 and causes_confusion:
|
| 247 |
+
return "PARTIALLY_TRUE"
|
| 248 |
+
elif priority_score <= 3.0 and not negative_dominant:
|
| 249 |
+
return "TRUE"
|
| 250 |
+
elif economic_related and government_related:
|
| 251 |
+
# Special case for economic policies by government
|
| 252 |
+
if negative_dominant:
|
| 253 |
+
return "FALSE"
|
| 254 |
+
elif causes_confusion:
|
| 255 |
+
return "PARTIALLY_TRUE"
|
| 256 |
+
else:
|
| 257 |
+
return "TRUE"
|
| 258 |
+
else:
|
| 259 |
+
return "UNVERIFIED"
|
| 260 |
+
|
| 261 |
+
def image_profiling(request):
|
| 262 |
+
# import faiss
|
| 263 |
+
|
| 264 |
+
result = None
|
| 265 |
+
image_with_labels = None
|
| 266 |
+
cropped_faces_base64 = []
|
| 267 |
+
texts = None
|
| 268 |
+
proccessed = False
|
| 269 |
+
uploded_base64 = None
|
| 270 |
+
exifs = None
|
| 271 |
+
metadata = None
|
| 272 |
+
description = None
|
| 273 |
+
reverse_images = None
|
| 274 |
+
|
| 275 |
+
if request.method == 'POST':
|
| 276 |
+
form = ImageUploadForm(request.POST, request.FILES)
|
| 277 |
+
if form.is_valid():
|
| 278 |
+
proccessed = True
|
| 279 |
+
uploaded_image = request.FILES['image']
|
| 280 |
+
|
| 281 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
|
| 282 |
+
for chunk in uploaded_image.chunks():
|
| 283 |
+
tmp.write(chunk)
|
| 284 |
+
tmp_path = tmp.name
|
| 285 |
+
|
| 286 |
+
image = PILImage.open(uploaded_image)
|
| 287 |
+
image_np = numpy_lib.array(image.convert('RGB'))
|
| 288 |
+
exifs = devlab_image.extract_exif(tmp_path)
|
| 289 |
+
metadata = devlab_image.extract_metadata_exiftool(tmp_path)
|
| 290 |
+
description = devlab_image.generate_description_blip(tmp_path)
|
| 291 |
+
# reverse_images = devlab_image.reverse_search(tmp_path)
|
| 292 |
+
|
| 293 |
+
buffered = io.BytesIO()
|
| 294 |
+
image.save(buffered, format="PNG") # or "JPEG", depending on your image format
|
| 295 |
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 296 |
+
uploded_base64 = f"data:image/png;base64,{img_str}"
|
| 297 |
+
|
| 298 |
+
texts = devlab_image.extract_text_numpy(image_np)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
# Detect face embeddings using DeepFace
|
| 302 |
+
face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
if not face_embeddings:
|
| 306 |
+
return "❌ No faces detected in the image."
|
| 307 |
+
|
| 308 |
+
recognized_faces = {}
|
| 309 |
+
cropped_faces = []
|
| 310 |
+
|
| 311 |
+
for face_data in face_embeddings:
|
| 312 |
+
query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
|
| 313 |
+
|
| 314 |
+
results = devlab_image.query_embedding(query_embedding,1)
|
| 315 |
+
if results and len(results) > 0 and len(results[0]) > 0:
|
| 316 |
+
entity = results[0][0].entity
|
| 317 |
+
print(f"Entity: {entity}") # See what fields are present in the entity
|
| 318 |
+
|
| 319 |
+
face_name = entity.get('name') if entity else 'Unknown'
|
| 320 |
+
fdescription = entity.get('short_description') if entity else ''
|
| 321 |
+
if fdescription is None:
|
| 322 |
+
fdescription = ''
|
| 323 |
+
|
| 324 |
+
distance = round(results[0][0].distance, 4)
|
| 325 |
+
|
| 326 |
+
if distance * 100 > 95:
|
| 327 |
+
face_name = f"{face_name} (CLOSEST)"
|
| 328 |
+
# Store recognized face data
|
| 329 |
+
recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
|
| 330 |
+
"name": face_name,
|
| 331 |
+
"distance": distance,
|
| 332 |
+
"description": fdescription,
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
# Face location for drawing rectangle and adding label
|
| 336 |
+
face_location = face_data["facial_area"]
|
| 337 |
+
x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
|
| 338 |
+
|
| 339 |
+
# Draw rectangle and label on the image
|
| 340 |
+
# cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
|
| 341 |
+
cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
| 342 |
+
|
| 343 |
+
# Crop the detected face and prepare it for displaying
|
| 344 |
+
cropped_face = image_np[y:y + h, x:x + w]
|
| 345 |
+
cropped_faces.append([cropped_face, face_name, distance, fdescription])
|
| 346 |
+
|
| 347 |
+
# label = f"{face_name} (Dist: {round(distance, 2)})"
|
| 348 |
+
|
| 349 |
+
else:
|
| 350 |
+
print('No result found')
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
# Convert the image with labels to base64 for HTML rendering
|
| 355 |
+
_, buffer = cv2.imencode('.png', image_np)
|
| 356 |
+
image_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 357 |
+
|
| 358 |
+
# Convert cropped faces to base64 for displaying in template
|
| 359 |
+
cropped_faces_base64 = []
|
| 360 |
+
for face, face_name, distance, fdescription in cropped_faces:
|
| 361 |
+
_, buffer = cv2.imencode('.png', face)
|
| 362 |
+
face_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 363 |
+
cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",face_name, distance, fdescription])
|
| 364 |
+
|
| 365 |
+
# Prepare result for template rendering
|
| 366 |
+
result = recognized_faces
|
| 367 |
+
image_with_labels = f"data:image/png;base64,{image_base64}"
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
else:
|
| 371 |
+
form = ImageUploadForm()
|
| 372 |
+
|
| 373 |
+
return render(request, 'image_profiling.html', {
|
| 374 |
+
'form': form,
|
| 375 |
+
'proccessed' : proccessed,
|
| 376 |
+
'uploaded_base64': uploded_base64,
|
| 377 |
+
'image_with_labels': image_with_labels,
|
| 378 |
+
'cropped_faces': cropped_faces_base64,
|
| 379 |
+
'texts': texts,
|
| 380 |
+
'exifs': exifs,
|
| 381 |
+
'metadata': metadata,
|
| 382 |
+
'description': description,
|
| 383 |
+
'reverse_images': reverse_images
|
| 384 |
+
})
|
| 385 |
+
|
| 386 |
+
# def detect_faces2(request):
|
| 387 |
+
# import faiss
|
| 388 |
+
# import numpy as np
|
| 389 |
+
# import pickle
|
| 390 |
+
# from deepface import DeepFace
|
| 391 |
+
# import cv2
|
| 392 |
+
# import base64
|
| 393 |
+
# from io import BytesIO
|
| 394 |
+
# from PIL import Image
|
| 395 |
+
# import os
|
| 396 |
+
|
| 397 |
+
# result = None
|
| 398 |
+
# image_with_labels = None
|
| 399 |
+
# cropped_faces_base64 = []
|
| 400 |
+
|
| 401 |
+
# if request.method == 'POST':
|
| 402 |
+
# form = ImageUploadForm(request.POST, request.FILES)
|
| 403 |
+
# if form.is_valid():
|
| 404 |
+
# uploaded_image = request.FILES['image']
|
| 405 |
+
|
| 406 |
+
# # Open the uploaded image with Pillow and convert to RGB
|
| 407 |
+
# image = Image.open(uploaded_image).convert('RGB')
|
| 408 |
+
# image_np = numpy_lib.array(image)
|
| 409 |
+
|
| 410 |
+
# # Load FAISS index and metadata
|
| 411 |
+
# save_path = os.path.join(os.path.dirname(__file__), "deepface")
|
| 412 |
+
# try:
|
| 413 |
+
# index = faiss.read_index(save_path + "/faiss_hnsw_index.bin")
|
| 414 |
+
# with open(save_path + "/metadata.pkl", "rb") as f:
|
| 415 |
+
# names = pickle.load(f)
|
| 416 |
+
# except Exception as e:
|
| 417 |
+
# return f"Error loading FAISS index or metadata: {str(e)}"
|
| 418 |
+
|
| 419 |
+
# # Set search parameters for better accuracy in FAISS
|
| 420 |
+
# index.hnsw.efSearch = 100 # Larger = better accuracy, but slower
|
| 421 |
+
|
| 422 |
+
# # Detect face embeddings using DeepFace
|
| 423 |
+
# face_embeddings = DeepFace.represent(image_np, model_name="Facenet", enforce_detection=False)
|
| 424 |
+
|
| 425 |
+
# if not face_embeddings:
|
| 426 |
+
# return "❌ No faces detected in the image."
|
| 427 |
+
|
| 428 |
+
# recognized_faces = {}
|
| 429 |
+
# cropped_faces = []
|
| 430 |
+
|
| 431 |
+
# for face_data in face_embeddings:
|
| 432 |
+
# query_embedding = numpy_lib.array(face_data["embedding"], dtype=numpy_lib.float32).reshape(1, -1)
|
| 433 |
+
|
| 434 |
+
# # Search for the closest matches in the FAISS index
|
| 435 |
+
# D, I = index.search(query_embedding, 1) # D = distances, I = indices
|
| 436 |
+
|
| 437 |
+
# # Get the top match for this face
|
| 438 |
+
# face_name = names[I[0][0]]
|
| 439 |
+
# distance = D[0][0]
|
| 440 |
+
|
| 441 |
+
# # Store recognized face data
|
| 442 |
+
# recognized_faces[f"clip_{len(recognized_faces) + 1}"] = {
|
| 443 |
+
# "name": face_name,
|
| 444 |
+
# "distance": round(distance, 4)
|
| 445 |
+
# }
|
| 446 |
+
|
| 447 |
+
# # Face location for drawing rectangle and adding label
|
| 448 |
+
# face_location = face_data["facial_area"]
|
| 449 |
+
# x, y, w, h = face_location["x"], face_location["y"], face_location["w"], face_location["h"]
|
| 450 |
+
|
| 451 |
+
# # Draw rectangle and label on the image
|
| 452 |
+
# # cv2.putText(image_np, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
|
| 453 |
+
# cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
| 454 |
+
|
| 455 |
+
# # Crop the detected face and prepare it for displaying
|
| 456 |
+
# cropped_face = image_np[y:y + h, x:x + w]
|
| 457 |
+
# cropped_faces.append([cropped_face, face_name])
|
| 458 |
+
|
| 459 |
+
# label = f"{face_name} (Dist: {round(distance, 4)})"
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# # Convert the image with labels to base64 for HTML rendering
|
| 464 |
+
# _, buffer = cv2.imencode('.png', image_np)
|
| 465 |
+
# image_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 466 |
+
|
| 467 |
+
# # Convert cropped faces to base64 for displaying in template
|
| 468 |
+
# cropped_faces_base64 = []
|
| 469 |
+
# for face,fname in cropped_faces:
|
| 470 |
+
# _, buffer = cv2.imencode('.png', face)
|
| 471 |
+
# face_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 472 |
+
# cropped_faces_base64.append([f"data:image/png;base64,{face_base64}",fname])
|
| 473 |
+
|
| 474 |
+
# # Prepare result for template rendering
|
| 475 |
+
# result = recognized_faces
|
| 476 |
+
# image_with_labels = f"data:image/png;base64,{image_base64}"
|
| 477 |
+
|
| 478 |
+
# else:
|
| 479 |
+
# form = ImageUploadForm()
|
| 480 |
+
|
| 481 |
+
# return render(request, 'face_detection.html', {
|
| 482 |
+
# 'form': form,
|
| 483 |
+
# 'result': result,
|
| 484 |
+
# 'image_with_labels': image_with_labels,
|
| 485 |
+
# 'cropped_faces': cropped_faces_base64 # Pass the list of cropped faces to the template
|
| 486 |
+
# })
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def register_face(request):
|
| 490 |
+
from ai_api.library.devlab_image import DevLabImage
|
| 491 |
+
import os
|
| 492 |
+
from django.core.files.storage import FileSystemStorage
|
| 493 |
+
from django.conf import settings
|
| 494 |
+
|
| 495 |
+
result = None
|
| 496 |
+
if request.method == 'POST':
|
| 497 |
+
form = RegisterFaceForm(request.POST)
|
| 498 |
+
person = request.POST.get("person", "").upper()
|
| 499 |
+
keywords = request.POST.get("keywords", "")
|
| 500 |
+
files = request.FILES.getlist('images')
|
| 501 |
+
|
| 502 |
+
devlab_image = DevLabImage()
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
if files:
|
| 506 |
+
print('Upload manual')
|
| 507 |
+
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 508 |
+
upload_dir = os.path.join(project_root, 'people', person)
|
| 509 |
+
|
| 510 |
+
print(f"Saving to: {upload_dir}")
|
| 511 |
+
os.makedirs(upload_dir, exist_ok=True)
|
| 512 |
+
|
| 513 |
+
fs = FileSystemStorage(location=upload_dir)
|
| 514 |
+
|
| 515 |
+
for file in files:
|
| 516 |
+
filename = fs.save(file.name, file)
|
| 517 |
+
file_url = fs.url(filename)
|
| 518 |
+
print(f"Saved: {file_url}")
|
| 519 |
+
devlab_image.extract_face( person, keywords)
|
| 520 |
+
else:
|
| 521 |
+
print('Download from Google')
|
| 522 |
+
devlab_image.register_person(person, keywords)
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
else:
|
| 526 |
+
form = RegisterFaceForm()
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
return render(request, 'register_face.html', {
|
| 530 |
+
'form': form,
|
| 531 |
+
'result': result,
|
| 532 |
+
})
|
| 533 |
+
|
| 534 |
+
def check_progress(request, key):
|
| 535 |
+
# print(f"getting progress key {key}")
|
| 536 |
+
progress = cache.get(key, {'stage': 'downloading', 'percent': 0})
|
| 537 |
+
# print(progress)
|
| 538 |
+
return JsonResponse(progress)
|
| 539 |
+
|
| 540 |
+
def handle_uploaded_file(file):
|
| 541 |
+
mime_type, _ = mimetypes.guess_type(file.name)
|
| 542 |
+
|
| 543 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
|
| 544 |
+
output_audio_file = temp_audio_file.name
|
| 545 |
+
|
| 546 |
+
if mime_type and mime_type.startswith('video'):
|
| 547 |
+
# Save video temporarily
|
| 548 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[-1]) as temp_video_file:
|
| 549 |
+
for chunk in file.chunks():
|
| 550 |
+
temp_video_file.write(chunk)
|
| 551 |
+
video_path = temp_video_file.name
|
| 552 |
+
|
| 553 |
+
# Extract audio using ffmpeg
|
| 554 |
+
command = [
|
| 555 |
+
'ffmpeg',
|
| 556 |
+
'-y',
|
| 557 |
+
'-i', video_path,
|
| 558 |
+
'-vn', # no video
|
| 559 |
+
'-acodec', 'pcm_s16le', # WAV format
|
| 560 |
+
'-ar', '16000', # 16 kHz sample rate
|
| 561 |
+
'-ac', '1', # Mono channel
|
| 562 |
+
output_audio_file
|
| 563 |
+
]
|
| 564 |
+
|
| 565 |
+
try:
|
| 566 |
+
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
|
| 567 |
+
print("FFmpeg stderr:", result.stderr.decode())
|
| 568 |
+
|
| 569 |
+
except subprocess.CalledProcessError as e:
|
| 570 |
+
logger.error(f"ffmpeg failed with error: {e.stderr.decode()}")
|
| 571 |
+
raise Exception(f"Audio extraction failed: {e.stderr.decode()}")
|
| 572 |
+
|
| 573 |
+
# Clean up temporary video file
|
| 574 |
+
os.remove(video_path)
|
| 575 |
+
|
| 576 |
+
else:
|
| 577 |
+
# If audio, save it directly
|
| 578 |
+
with open(output_audio_file, 'wb') as f:
|
| 579 |
+
for chunk in file.chunks():
|
| 580 |
+
f.write(chunk)
|
| 581 |
+
|
| 582 |
+
return output_audio_file
|
| 583 |
+
|
| 584 |
+
def format_time(seconds):
|
| 585 |
+
# Convert seconds to WebVTT time format (hh:mm:ss.mmm)
|
| 586 |
+
m, s = divmod(seconds, 60)
|
| 587 |
+
h, m = divmod(m, 60)
|
| 588 |
+
ms = int((s - int(s)) * 1000) # Milliseconds
|
| 589 |
+
return f"{int(h):02}:{int(m):02}:{int(s):02}.{ms:03}"
|
| 590 |
+
|
| 591 |
+
def generate_vtt(segments):
|
| 592 |
+
# Generate the VTT content from the Whisper segments
|
| 593 |
+
vtt_content = "WEBVTT\n\n"
|
| 594 |
+
|
| 595 |
+
for segment in segments:
|
| 596 |
+
start_time = segment['start']
|
| 597 |
+
end_time = segment['end']
|
| 598 |
+
text = segment['text']
|
| 599 |
+
|
| 600 |
+
# Convert seconds to WebVTT time format
|
| 601 |
+
start_time_str = format_time(start_time)
|
| 602 |
+
end_time_str = format_time(end_time)
|
| 603 |
+
|
| 604 |
+
vtt_content += f"{start_time_str} --> {end_time_str}\n{text}\n\n"
|
| 605 |
+
|
| 606 |
+
return vtt_content
|
| 607 |
+
|
| 608 |
+
def save_vtt(output_audio_file, vtt):
|
| 609 |
+
base_name = os.path.splitext(os.path.basename(output_audio_file))[0]
|
| 610 |
+
new_filename = base_name + ".vtt"
|
| 611 |
+
|
| 612 |
+
final_path = os.path.join(settings.MEDIA_ROOT, 'vtt', new_filename)
|
| 613 |
+
os.makedirs(os.path.dirname(final_path), exist_ok=True)
|
| 614 |
+
|
| 615 |
+
with open(final_path, "w", encoding="utf-8") as f:
|
| 616 |
+
f.write(vtt)
|
| 617 |
+
|
| 618 |
+
return final_path
|
| 619 |
+
|
| 620 |
+
def transcription(request):
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
transcription = None
|
| 624 |
+
error = None
|
| 625 |
+
progress_key = str(uuid.uuid4())
|
| 626 |
+
|
| 627 |
+
if request.method == "POST":
|
| 628 |
+
|
| 629 |
+
progress_key = request.POST.get("progress_key", progress_key)
|
| 630 |
+
|
| 631 |
+
model = globals.whisper_model
|
| 632 |
+
form = YouTubeURLForm(request.POST)
|
| 633 |
+
|
| 634 |
+
#if form.is_valid():
|
| 635 |
+
file = request.FILES.get('file')
|
| 636 |
+
if file:
|
| 637 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
|
| 638 |
+
# for chunk in file.chunks():
|
| 639 |
+
# temp_file.write(chunk)
|
| 640 |
+
# output_audio_file = temp_file.name
|
| 641 |
+
output_audio_file = handle_uploaded_file(file)
|
| 642 |
+
if os.path.getsize(output_audio_file) == 0:
|
| 643 |
+
raise RuntimeError("FFmpeg produced an empty audio file.")
|
| 644 |
+
|
| 645 |
+
print(f"transcribing : {output_audio_file}")
|
| 646 |
+
cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
|
| 647 |
+
result = model.transcribe(output_audio_file,verbose=False)
|
| 648 |
+
vtt = generate_vtt(result['segments'])
|
| 649 |
+
vtt_file = save_vtt(output_audio_file, vtt)
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
else:
|
| 653 |
+
cache.set(progress_key, {'stage': 'downloading', 'percent': 0})
|
| 654 |
+
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
|
| 655 |
+
|
| 656 |
+
def progress_hook(d):
|
| 657 |
+
# print(f"status {d['status']}")
|
| 658 |
+
if d['status'] == 'downloading':
|
| 659 |
+
# print(d)
|
| 660 |
+
percent_str = d.get('_percent_str', '0%').strip()
|
| 661 |
+
clean_str = ansi_escape.sub('', percent_str).strip()
|
| 662 |
+
# print(f"clean percent_str: {repr(clean_str)}") # e.g. '100.0%'
|
| 663 |
+
|
| 664 |
+
try:
|
| 665 |
+
match = re.search(r'(\d+(?:\.\d+)?)', clean_str)
|
| 666 |
+
if match:
|
| 667 |
+
percent = float(match.group(1))
|
| 668 |
+
else:
|
| 669 |
+
print("❌ Regex didn't match!")
|
| 670 |
+
percent = 0
|
| 671 |
+
except Exception as e:
|
| 672 |
+
print(f"❌ Error parsing percent: {e}")
|
| 673 |
+
percent = 0
|
| 674 |
+
|
| 675 |
+
# print(f"✅ current progress for {progress_key} is: {percent}")
|
| 676 |
+
cache.set(progress_key, {'stage': 'downloading', 'percent': percent})
|
| 677 |
+
|
| 678 |
+
url = request.POST.get('url')
|
| 679 |
+
unique_id = str(uuid.uuid4())
|
| 680 |
+
temp_dir = tempfile.gettempdir()
|
| 681 |
+
base_filename = f"temp_{unique_id}"
|
| 682 |
+
download_path = f"{temp_dir}/{base_filename}.%(ext)s"
|
| 683 |
+
# print(f"download_path: {download_path}")
|
| 684 |
+
output_audio_file = f"{temp_dir}/{base_filename}.mp3"
|
| 685 |
+
|
| 686 |
+
ydl_opts = {
|
| 687 |
+
'format': 'bestaudio/best',
|
| 688 |
+
'outtmpl': download_path, # No fixed extension!
|
| 689 |
+
'postprocessors': [{
|
| 690 |
+
'key': 'FFmpegExtractAudio',
|
| 691 |
+
'preferredcodec': 'mp3',
|
| 692 |
+
'preferredquality': '192',
|
| 693 |
+
}],
|
| 694 |
+
'progress_hooks': [progress_hook],
|
| 695 |
+
'quiet': True,
|
| 696 |
+
'no_warnings': True,
|
| 697 |
+
'noplaylist': True,
|
| 698 |
+
}
|
| 699 |
+
print(f"downloading : {url}")
|
| 700 |
+
try:
|
| 701 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 702 |
+
ydl.download([url])
|
| 703 |
+
print(f"transcribing : {output_audio_file}")
|
| 704 |
+
cache.set(progress_key, {'stage': 'transcribing', 'percent': 100})
|
| 705 |
+
result = model.transcribe(output_audio_file,verbose=False)
|
| 706 |
+
vtt = generate_vtt(result['segments'])
|
| 707 |
+
vtt_file = save_vtt(output_audio_file,vtt)
|
| 708 |
+
except Exception as e:
|
| 709 |
+
error = str(e)
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
# transcription = result['text']
|
| 713 |
+
|
| 714 |
+
# audio = AudioSegment.from_file(output_audio_file)
|
| 715 |
+
# chunk_length_ms = 60 * 1000 # 1-minute chunks
|
| 716 |
+
# chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
|
| 717 |
+
# results = []
|
| 718 |
+
# total_chunks = len(chunks)
|
| 719 |
+
# cache.set(progress_key, {'stage': 'transcribing', 'percent': 0})
|
| 720 |
+
|
| 721 |
+
# for i, chunk in enumerate(chunks):
|
| 722 |
+
# temp_filename = f"temp_chunk_{i}.wav"
|
| 723 |
+
# chunk.export(temp_filename, format="wav")
|
| 724 |
+
|
| 725 |
+
# result = model.transcribe(temp_filename, verbose=False)
|
| 726 |
+
# results.append(result["text"])
|
| 727 |
+
|
| 728 |
+
# os.remove(temp_filename)
|
| 729 |
+
|
| 730 |
+
# # Update progress
|
| 731 |
+
# percent = int((i + 1) / total_chunks * 100)
|
| 732 |
+
# cache.set(progress_key, {'stage': 'transcribing', 'percent': percent})
|
| 733 |
+
|
| 734 |
+
# # Combine all chunk texts
|
| 735 |
+
# transcription = "\n".join(results)
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
cache.set(progress_key, {'stage': 'done', 'percent': 100})
|
| 739 |
+
|
| 740 |
+
filename = os.path.basename(output_audio_file)
|
| 741 |
+
final_path = os.path.join(settings.MEDIA_ROOT, 'uploads', filename)
|
| 742 |
+
os.makedirs(os.path.dirname(final_path), exist_ok=True)
|
| 743 |
+
shutil.move(output_audio_file, final_path)
|
| 744 |
+
|
| 745 |
+
# Public URL
|
| 746 |
+
|
| 747 |
+
|
| 748 |
+
file_url = settings.MEDIA_URL + 'uploads/' + filename
|
| 749 |
+
audio_html = f'<audio controls><source src="{file_url}">Your browser does not support the audio element.</audio>'
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
return JsonResponse({'text': result['text'], 'segments': result['segments'], 'audio_file': audio_html })
|
| 753 |
+
# if os.path.exists(output_audio_file):
|
| 754 |
+
# os.remove(output_audio_file)
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
# return render(request, 'transcription.html', {
|
| 758 |
+
# 'form': form,
|
| 759 |
+
# 'transcription': transcription,
|
| 760 |
+
# 'error': error,
|
| 761 |
+
# 'progress_key': progress_key,
|
| 762 |
+
# })
|
| 763 |
+
|
| 764 |
+
else:
|
| 765 |
+
form = TranscribeForm()
|
| 766 |
+
|
| 767 |
+
return render(request, 'transcription.html', {
|
| 768 |
+
'form': form,
|
| 769 |
+
'transcription': transcription,
|
| 770 |
+
'error': error,
|
| 771 |
+
'progress_key': progress_key,
|
| 772 |
+
})
|
| 773 |
+
|
| 774 |
+
def csv_to_html_table(filepath):
|
| 775 |
+
def is_valid_url(url):
|
| 776 |
+
# URL pattern matching - must start with http:// or https://
|
| 777 |
+
url_pattern = re.compile(
|
| 778 |
+
r'^https?://' # must start with http:// or https://
|
| 779 |
+
r'([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+' # domain
|
| 780 |
+
r'[a-zA-Z]{2,}' # TLD
|
| 781 |
+
r'(/[a-zA-Z0-9-._~:/?#[\]@!$&\'()*+,;=]*)?$' # path and query
|
| 782 |
+
)
|
| 783 |
+
return bool(url_pattern.match(url))
|
| 784 |
+
|
| 785 |
+
html = '<table id="dataset" class="table table-bordered mt-2 smaller">'
|
| 786 |
+
with open(filepath, newline='') as csvfile:
|
| 787 |
+
reader = csv.reader(csvfile)
|
| 788 |
+
for i, row in enumerate(reader):
|
| 789 |
+
if i == 0:
|
| 790 |
+
html += '<thead>'
|
| 791 |
+
html += "<tr>" + "".join(f"<th>{col}</th>" for col in row) + "</tr>"
|
| 792 |
+
html += '</thead>'
|
| 793 |
+
else:
|
| 794 |
+
html += "<tr>" + "".join(
|
| 795 |
+
f'<td><a href="{col}" target="_blank" rel="noopener noreferrer">{col}</a></td>' if is_valid_url(col) else f"<td>{col}</td>"
|
| 796 |
+
for col in row
|
| 797 |
+
) + "</tr>"
|
| 798 |
+
html += "</table>"
|
| 799 |
+
return html
|
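A quick illustration of the dictionary shape determine_verdict() above works with. The field names below simply mirror the keys the function itself reads (priority_flags, sentiment_counts, priority_score, claim, keywords); whether priority_indexer.run() emits exactly these keys in the *_priority.json report is an assumption, and the values are made up for the sketch.

# Sketch only: importing ai_api.views assumes the app's model files
# (label_map.json, the classifier in ai_api.globals) are available,
# since views.py loads them at import time.
from ai_api.views import determine_verdict

sample_priority_data = {
    "claim": "Government announces a new fuel subsidy scheme",
    "keywords": ["fuel", "subsidy", "government"],
    "priority_score": 7.5,
    "priority_flags": {
        "affects_government": 1,
        "economic_impact": 1,
        "law_related": 0,
        "cause_confusion": 1,
    },
    "sentiment_counts": {"positive": 3, "negative": 12, "neutral": 5},
}

# High priority score, negative-dominant sentiment and a government-related
# flag fall into the first branch of the verdict logic.
print(determine_verdict(sample_priority_data))  # -> "FALSE"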
ai_api/widgets.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.forms.widgets import ClearableFileInput
|
| 2 |
+
|
| 3 |
+
class MultipleFileInput(ClearableFileInput):
|
| 4 |
+
allow_multiple_selected = True
|
| 5 |
+
|
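ai_api/forms.py is elsewhere in this commit, so how MultipleFileInput gets wired into a form is not visible here. For reference, the usual Django 4.2 pairing is a companion field that cleans a list of uploaded files, which is what register_face() expects when it calls request.FILES.getlist('images'). This is only a sketch of that pattern, not necessarily what forms.py does:

from django import forms
from ai_api.widgets import MultipleFileInput

class MultipleFileField(forms.FileField):
    # FileField variant whose widget allows selecting several files at once.
    def __init__(self, *args, **kwargs):
        kwargs.setdefault("widget", MultipleFileInput())
        super().__init__(*args, **kwargs)

    def clean(self, data, initial=None):
        single_file_clean = super().clean
        if isinstance(data, (list, tuple)):
            # Clean each uploaded file individually and return the list.
            return [single_file_clean(d, initial) for d in data]
        return single_file_clean(data, initial)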
csv_people.py
ADDED
|
@@ -0,0 +1,20 @@
|
| 1 |
+
import os
|
| 2 |
+
import csv
|
| 3 |
+
|
| 4 |
+
# Path to the folder you want to scan
|
| 5 |
+
folder_path = 'people'
|
| 6 |
+
|
| 7 |
+
# Get all subfolder names
|
| 8 |
+
subfolders = [f.name for f in os.scandir(folder_path) if f.is_dir()]
|
| 9 |
+
|
| 10 |
+
# Path to the output CSV file
|
| 11 |
+
csv_file = 'subfolders.csv'
|
| 12 |
+
|
| 13 |
+
# Write the subfolder names to the CSV file
|
| 14 |
+
with open(csv_file, mode='w', newline='') as file:
|
| 15 |
+
writer = csv.writer(file)
|
| 16 |
+
writer.writerow(['Subfolder Name']) # Write the header
|
| 17 |
+
for subfolder in subfolders:
|
| 18 |
+
writer.writerow([subfolder]) # Write each subfolder name
|
| 19 |
+
|
| 20 |
+
print(f"Subfolder names have been written to {csv_file}")
|
delete_milvus.py
ADDED
|
@@ -0,0 +1,29 @@
|
| 1 |
+
from pymilvus import Collection, connections
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
|
| 8 |
+
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
|
| 9 |
+
|
| 10 |
+
connections.connect("default", host=milvus_host, port=int(milvus_port))
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Now, connect to the collection
|
| 14 |
+
collection = Collection("faces")
|
| 15 |
+
|
| 16 |
+
# Query the collection for entities whose 'name' matches a specific value
|
| 17 |
+
query = 'name == "YAB DATO SERI ANWAR IBRAHIM"' # Match entities with this exact name
|
| 18 |
+
|
| 19 |
+
# Run the query to find the matching entities
|
| 20 |
+
results = collection.query(query, output_fields=["id", "name"])
|
| 21 |
+
|
| 22 |
+
# Delete the matching entities, if any were found
|
| 23 |
+
if results:
|
| 24 |
+
ids_to_delete = [str(result["id"]) for result in results]
|
| 25 |
+
id_expr = f"id in [{', '.join(ids_to_delete)}]"
|
| 26 |
+
collection.delete(expr=id_expr)
|
| 27 |
+
print(f"✅ Deleted entities: {ids_to_delete}")
|
| 28 |
+
else:
|
| 29 |
+
print("❌ No entities found for deletion.")
|
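A small follow-up sketch for the script above, using the same connection and "faces" collection: re-running the query after the delete is a cheap way to confirm the entities are actually gone before relying on it (depending on the collection's consistency level the re-query may lag slightly behind the delete).

# Re-check how many entities still match the same expression after the delete.
remaining = collection.query('name == "YAB DATO SERI ANWAR IBRAHIM"', output_fields=["id"])
print(f"Entities still matching after delete: {len(remaining)}")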
devlab_next/.gitignore
ADDED
|
@@ -0,0 +1,68 @@
|
| 1 |
+
# Python bytecode files
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
__pycache__/
|
| 6 |
+
|
| 7 |
+
# Virtual environment
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
|
| 11 |
+
# Distribution / packaging
|
| 12 |
+
*.egg
|
| 13 |
+
*.egg-info
|
| 14 |
+
dist/
|
| 15 |
+
build/
|
| 16 |
+
*.whl
|
| 17 |
+
|
| 18 |
+
# IDE files
|
| 19 |
+
.idea/
|
| 20 |
+
.vscode/
|
| 21 |
+
|
| 22 |
+
# Jupyter Notebook files
|
| 23 |
+
.ipynb_checkpoints
|
| 24 |
+
|
| 25 |
+
# PyInstaller
|
| 26 |
+
*.manifest
|
| 27 |
+
*.spec
|
| 28 |
+
|
| 29 |
+
# Test and coverage reports
|
| 30 |
+
.coverage
|
| 31 |
+
*.coveragerc
|
| 32 |
+
nosetests.xml
|
| 33 |
+
coverage.xml
|
| 34 |
+
*.coveralls.yml
|
| 35 |
+
|
| 36 |
+
# MyPy
|
| 37 |
+
.mypy_cache/
|
| 38 |
+
.dmypy.json
|
| 39 |
+
dmypy.json
|
| 40 |
+
|
| 41 |
+
# Pytest
|
| 42 |
+
.cache/
|
| 43 |
+
|
| 44 |
+
# Sphinx documentation
|
| 45 |
+
docs/_build/
|
| 46 |
+
|
| 47 |
+
# pytest and flake8
|
| 48 |
+
*.log
|
| 49 |
+
|
| 50 |
+
# VS Code settings
|
| 51 |
+
.vscode/
|
| 52 |
+
|
| 53 |
+
# Django secrets
|
| 54 |
+
*.env
|
| 55 |
+
|
| 56 |
+
# Flask instance folder
|
| 57 |
+
instance/
|
| 58 |
+
|
| 59 |
+
# PyCharm project files
|
| 60 |
+
.idea/
|
| 61 |
+
|
| 62 |
+
# Other Python-related files
|
| 63 |
+
*.bak
|
| 64 |
+
*.swp
|
| 65 |
+
*.swo
|
| 66 |
+
ddet_classification/
|
| 67 |
+
.DS_Store
|
| 68 |
+
*.pkl
|
devlab_next/__init__.py
ADDED
|
File without changes
|
devlab_next/asgi.py
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
"""
|
| 2 |
+
ASGI config for devlab_next project.
|
| 3 |
+
|
| 4 |
+
It exposes the ASGI callable as a module-level variable named ``application``.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from django.core.asgi import get_asgi_application
|
| 13 |
+
|
| 14 |
+
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
|
| 15 |
+
|
| 16 |
+
application = get_asgi_application()
|
devlab_next/settings.py
ADDED
|
@@ -0,0 +1,166 @@
|
| 1 |
+
"""
|
| 2 |
+
Django settings for devlab_next project.
|
| 3 |
+
|
| 4 |
+
Generated by 'django-admin startproject' using Django 4.2.7.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/topics/settings/
|
| 8 |
+
|
| 9 |
+
For the full list of settings and their values, see
|
| 10 |
+
https://docs.djangoproject.com/en/4.2/ref/settings/
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
| 17 |
+
BASE_DIR = Path(__file__).resolve().parent.parent
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Quick-start development settings - unsuitable for production
|
| 21 |
+
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
|
| 22 |
+
|
| 23 |
+
# SECURITY WARNING: keep the secret key used in production secret!
|
| 24 |
+
SECRET_KEY = 'django-insecure-5a87e9*^s30hb+%+h@t^06493w2tpv7w6%+(0!#iu77b%*8=#i'
|
| 25 |
+
|
| 26 |
+
# SECURITY WARNING: don't run with debug turned on in production!
|
| 27 |
+
DEBUG = True
|
| 28 |
+
|
| 29 |
+
ALLOWED_HOSTS = ['127.0.0.1','fctestbed.bernama.com','localhost']
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Application definition
|
| 33 |
+
|
| 34 |
+
INSTALLED_APPS = [
|
| 35 |
+
'django.contrib.admin',
|
| 36 |
+
'django.contrib.auth',
|
| 37 |
+
'django.contrib.contenttypes',
|
| 38 |
+
'django.contrib.sessions',
|
| 39 |
+
'django.contrib.messages',
|
| 40 |
+
'django.contrib.staticfiles',
|
| 41 |
+
'rest_framework',
|
| 42 |
+
# 'ai_api',
|
| 43 |
+
'ai_api.apps.AiApiConfig',
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
MIDDLEWARE = [
|
| 47 |
+
'django.middleware.security.SecurityMiddleware',
|
| 48 |
+
'django.contrib.sessions.middleware.SessionMiddleware',
|
| 49 |
+
'django.middleware.common.CommonMiddleware',
|
| 50 |
+
'django.middleware.csrf.CsrfViewMiddleware',
|
| 51 |
+
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
| 52 |
+
'django.contrib.messages.middleware.MessageMiddleware',
|
| 53 |
+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
| 54 |
+
# 'ai_api.middleware.HMACAuthMiddleware'
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
ROOT_URLCONF = 'devlab_next.urls'
|
| 58 |
+
|
| 59 |
+
TEMPLATES = [
|
| 60 |
+
{
|
| 61 |
+
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
| 62 |
+
'DIRS': [],
|
| 63 |
+
'APP_DIRS': True,
|
| 64 |
+
'OPTIONS': {
|
| 65 |
+
'context_processors': [
|
| 66 |
+
'django.template.context_processors.debug',
|
| 67 |
+
'django.template.context_processors.request',
|
| 68 |
+
'django.contrib.auth.context_processors.auth',
|
| 69 |
+
'django.contrib.messages.context_processors.messages',
|
| 70 |
+
],
|
| 71 |
+
},
|
| 72 |
+
},
|
| 73 |
+
]
|
| 74 |
+
|
| 75 |
+
WSGI_APPLICATION = 'devlab_next.wsgi.application'
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Database
|
| 79 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
|
| 80 |
+
|
| 81 |
+
# DATABASES = {
|
| 82 |
+
# 'default': {
|
| 83 |
+
# 'ENGINE': 'django.db.backends.sqlite3',
|
| 84 |
+
# 'NAME': BASE_DIR / 'db.sqlite3',
|
| 85 |
+
# }
|
| 86 |
+
# }
|
| 87 |
+
|
| 88 |
+
DATABASES = {
|
| 89 |
+
"default": {
|
| 90 |
+
"ENGINE": "django.db.backends.postgresql",
|
| 91 |
+
"NAME": os.environ.get("DB_NAME", "factcheckapidb"),
|
| 92 |
+
"USER": os.environ.get("DB_USER", "postgres"),
|
| 93 |
+
"PASSWORD": os.environ.get("DB_PASSWORD", "postgres"),
|
| 94 |
+
"HOST": os.environ.get("DB_HOST", "127.0.0.1"),
|
| 95 |
+
"PORT": os.environ.get("DB_PORT", "5432"),
|
| 96 |
+
}
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Password validation
|
| 102 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
|
| 103 |
+
|
| 104 |
+
AUTH_PASSWORD_VALIDATORS = [
|
| 105 |
+
{
|
| 106 |
+
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
| 116 |
+
},
|
| 117 |
+
]
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Internationalization
|
| 121 |
+
# https://docs.djangoproject.com/en/4.2/topics/i18n/
|
| 122 |
+
|
| 123 |
+
LANGUAGE_CODE = 'en-us'
|
| 124 |
+
|
| 125 |
+
TIME_ZONE = 'UTC'
|
| 126 |
+
|
| 127 |
+
USE_I18N = True
|
| 128 |
+
|
| 129 |
+
USE_TZ = True
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# Static files (CSS, JavaScript, Images)
|
| 133 |
+
# https://docs.djangoproject.com/en/4.2/howto/static-files/
|
| 134 |
+
|
| 135 |
+
STATIC_URL = '/static/'
|
| 136 |
+
# STATIC_ROOT = BASE_DIR / 'static/'
|
| 137 |
+
|
| 138 |
+
STATICFILES_DIRS = [
|
| 139 |
+
os.path.join(BASE_DIR, 'static'),
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# Default primary key field type
|
| 145 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
|
| 146 |
+
|
| 147 |
+
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
| 148 |
+
|
| 149 |
+
MEDIA_URL = '/media/'
|
| 150 |
+
MEDIA_ROOT = BASE_DIR / 'media'
|
| 151 |
+
|
| 152 |
+
CACHES = {
|
| 153 |
+
'default': {
|
| 154 |
+
'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', # In-memory
|
| 155 |
+
'LOCATION': 'progress-cache',
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
devlab_next/urls.py
ADDED
|
@@ -0,0 +1,33 @@
|
| 1 |
+
"""
|
| 2 |
+
URL configuration for devlab_next project.
|
| 3 |
+
|
| 4 |
+
The `urlpatterns` list routes URLs to views. For more information please see:
|
| 5 |
+
https://docs.djangoproject.com/en/4.2/topics/http/urls/
|
| 6 |
+
Examples:
|
| 7 |
+
Function views
|
| 8 |
+
1. Add an import: from my_app import views
|
| 9 |
+
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
| 10 |
+
Class-based views
|
| 11 |
+
1. Add an import: from other_app.views import Home
|
| 12 |
+
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
| 13 |
+
Including another URLconf
|
| 14 |
+
1. Import the include() function: from django.urls import include, path
|
| 15 |
+
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
| 16 |
+
"""
|
| 17 |
+
from django.contrib import admin
|
| 18 |
+
from django.urls import path, include
|
| 19 |
+
from django.conf import settings
|
| 20 |
+
from django.conf.urls.static import static
|
| 21 |
+
import os
|
| 22 |
+
|
| 23 |
+
admin.site.site_header = "BERNAMA Fact Check"
|
| 24 |
+
admin.site.site_title = "BERNAMA Fact Check Portal"
|
| 25 |
+
admin.site.index_title = "Dashboard"
|
| 26 |
+
|
| 27 |
+
urlpatterns = [
|
| 28 |
+
path('admin/', admin.site.urls),
|
| 29 |
+
path('', include('ai_api.urls')),
|
| 30 |
+
path('api/v1/', include('ai_api.api_urls')),
|
| 31 |
+
]+ static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
|
| 32 |
+
|
| 33 |
+
urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
|
devlab_next/wsgi.py
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
"""
|
| 2 |
+
WSGI config for devlab_next project.
|
| 3 |
+
|
| 4 |
+
It exposes the WSGI callable as a module-level variable named ``application``.
|
| 5 |
+
|
| 6 |
+
For more information on this file, see
|
| 7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from django.core.wsgi import get_wsgi_application
|
| 13 |
+
|
| 14 |
+
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'devlab_next.settings')
|
| 15 |
+
|
| 16 |
+
application = get_wsgi_application()
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,95 @@
|
| 1 |
+
version: '3.5'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
web:
|
| 5 |
+
build: .
|
| 6 |
+
container_name: django_app
|
| 7 |
+
mem_limit: 16g
|
| 8 |
+
command: gunicorn devlab_next.wsgi:application --bind 0.0.0.0:8000 --workers 3 --log-level debug
|
| 9 |
+
volumes:
|
| 10 |
+
- .:/app
|
| 11 |
+
ports:
|
| 12 |
+
- "8000:8000"
|
| 13 |
+
depends_on:
|
| 14 |
+
- milvus-standalone
|
| 15 |
+
environment:
|
| 16 |
+
- DJANGO_SETTINGS_MODULE=devlab_next.settings
|
| 17 |
+
- TF_CPP_MIN_LOG_LEVEL=2
|
| 18 |
+
networks:
|
| 19 |
+
- milvus_network
|
| 20 |
+
|
| 21 |
+
milvus-standalone:
|
| 22 |
+
container_name: milvus
|
| 23 |
+
image: milvusdb/milvus:v2.5.8
|
| 24 |
+
command: ["milvus", "run", "standalone"]
|
| 25 |
+
security_opt:
|
| 26 |
+
- seccomp:unconfined
|
| 27 |
+
restart: always
|
| 28 |
+
ports:
|
| 29 |
+
- "19530:19530" # gRPC
|
| 30 |
+
- "19121:19121" # HTTP (correct health port)
|
| 31 |
+
volumes:
|
| 32 |
+
- ./volumes/milvus:/var/lib/milvus
|
| 33 |
+
healthcheck:
|
| 34 |
+
test: ["CMD", "curl", "-f", "http://localhost:19121/healthz"]
|
| 35 |
+
interval: 30s
|
| 36 |
+
start_period: 90s
|
| 37 |
+
timeout: 20s
|
| 38 |
+
retries: 3
|
| 39 |
+
depends_on:
|
| 40 |
+
- etcd
|
| 41 |
+
- minio
|
| 42 |
+
environment:
|
| 43 |
+
ETCD_ENDPOINTS: etcd:2379
|
| 44 |
+
MINIO_ADDRESS: minio:9000
|
| 45 |
+
MINIO_ACCESS_KEY: minioadmin
|
| 46 |
+
MINIO_SECRET_KEY: minioadmin
|
| 47 |
+
MILVUS_LOG_LEVEL: debug
|
| 48 |
+
networks:
|
| 49 |
+
- milvus_network
|
| 50 |
+
|
| 51 |
+
etcd:
|
| 52 |
+
image: quay.io/coreos/etcd:v3.5.18
|
| 53 |
+
container_name: etcd
|
| 54 |
+
command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
| 55 |
+
environment:
|
| 56 |
+
- ETCD_AUTO_COMPACTION_MODE=revision
|
| 57 |
+
- ETCD_AUTO_COMPACTION_RETENTION=1000
|
| 58 |
+
- ETCD_QUOTA_BACKEND_BYTES=4294967296
|
| 59 |
+
- ETCD_SNAPSHOT_COUNT=50000
|
| 60 |
+
volumes:
|
| 61 |
+
- ./volumes/etcd:/etcd
|
| 62 |
+
healthcheck:
|
| 63 |
+
test: ["CMD", "etcdctl", "endpoint", "health"]
|
| 64 |
+
interval: 30s
|
| 65 |
+
timeout: 20s
|
| 66 |
+
retries: 3
|
| 67 |
+
ports:
|
| 68 |
+
- "2379:2379"
|
| 69 |
+
- "2380:2380"
|
| 70 |
+
networks:
|
| 71 |
+
- milvus_network
|
| 72 |
+
|
| 73 |
+
minio:
|
| 74 |
+
container_name: minio
|
| 75 |
+
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
|
| 76 |
+
environment:
|
| 77 |
+
MINIO_ACCESS_KEY: minioadmin
|
| 78 |
+
MINIO_SECRET_KEY: minioadmin
|
| 79 |
+
command: minio server /minio_data --console-address ":9001"
|
| 80 |
+
ports:
|
| 81 |
+
- "9000:9000"
|
| 82 |
+
- "9001:9001"
|
| 83 |
+
volumes:
|
| 84 |
+
- ./volumes/minio:/minio_data
|
| 85 |
+
healthcheck:
|
| 86 |
+
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
| 87 |
+
interval: 30s
|
| 88 |
+
timeout: 20s
|
| 89 |
+
retries: 3
|
| 90 |
+
networks:
|
| 91 |
+
- milvus_network
|
| 92 |
+
|
| 93 |
+
networks:
|
| 94 |
+
milvus_network:
|
| 95 |
+
driver: bridge
|
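Two deployment notes on this compose file, with a small connection sketch. First, from inside the web container the Milvus scripts above (delete_milvus.py, list_faces.py) cannot rely on their localhost default; the host has to be a name on milvus_network, so MILVUS_HOST would typically be set for the web service (an assumption, since the environment block above does not set it). Second, the progress polling in ai_api/views.py stores state in the LocMemCache configured in settings.py, which is per-process; with gunicorn --workers 3 a polling request can land on a worker that never saw the update, so a shared cache backend (Redis or the database cache) is the usual fix.

import os
from pymilvus import connections

# Sketch: resolve Milvus through the compose network instead of localhost.
connections.connect(
    "default",
    host=os.getenv("MILVUS_HOST", "milvus-standalone"),  # compose service name (assumption)
    port=int(os.getenv("MILVUS_PORT", "19530")),
)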
download_people.py
ADDED
|
@@ -0,0 +1,14 @@
|
| 1 |
+
from ai_api.library.devlab_image import DevLabImage
|
| 2 |
+
import csv
|
| 3 |
+
|
| 4 |
+
devlab_image = DevLabImage()
|
| 5 |
+
|
| 6 |
+
# # Open and read the CSV file
|
| 7 |
+
with open("subfolders.csv", mode="r", encoding="utf-8") as file:
|
| 8 |
+
reader = csv.reader(file)
|
| 9 |
+
for row in reader:
|
| 10 |
+
print(row[0], row[1]) # Each row is a list
|
| 11 |
+
devlab_image.register_person(row[0],row[1])
|
| 12 |
+
|
| 13 |
+
# field_value = input("Enter the name: ")
|
| 14 |
+
# devlab_image.download_person_images(field_value.upper())
|
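One mismatch worth flagging: csv_people.py above writes a single-column subfolders.csv with a 'Subfolder Name' header row, while this loop reads row[0] and row[1] and never skips the header, so it raises an IndexError on the header row unless the CSV is edited by hand to add a keywords column. A more defensive version of the same loop (sketch only, same DevLabImage API as above):

import csv
from ai_api.library.devlab_image import DevLabImage

devlab_image = DevLabImage()

with open("subfolders.csv", mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row written by csv_people.py
    for row in reader:
        if not row:
            continue
        name = row[0]
        keywords = row[1] if len(row) > 1 else ""  # keywords column may be missing
        devlab_image.register_person(name, keywords)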
list_faces.py
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
from pymilvus import Collection, connections
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import os
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
milvus_host = os.getenv("MILVUS_HOST", "localhost") # default localhost
|
| 8 |
+
milvus_port = os.getenv("MILVUS_PORT", "19530") # default 19530
|
| 9 |
+
|
| 10 |
+
connections.connect("default", host=milvus_host, port=int(milvus_port))
|
| 11 |
+
|
| 12 |
+
# Now, connect to the collection
|
| 13 |
+
collection = Collection("faces")
|
| 14 |
+
|
| 15 |
+
# Query expression that retrieves all documents with a non-null 'id' (or use any valid field)
|
| 16 |
+
query = "id IS NOT NULL" # Valid query expression to fetch all documents
|
| 17 |
+
|
| 18 |
+
# Retrieve all documents, adjust fields based on your collection schema
|
| 19 |
+
results = collection.query(query, output_fields=["id", "name"])
|
| 20 |
+
|
| 21 |
+
# Print all results
|
| 22 |
+
for result in results:
|
| 23 |
+
print(f"ID: {result['id']}, Name: {result.get('name', 'N/A')}")
|