Samir87699 committed on
Commit
b063251
·
1 Parent(s): b0601c2

Final Deploy

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ venv/
4
+ .env
5
+ .DS_Store
app/api/routes.py CHANGED
@@ -1,8 +1,11 @@
1
- from fastapi import APIRouter, UploadFile, File, Form, HTTPException
2
  from typing import Optional
3
  import logging
4
- from app.services.detector import analysis_service
 
5
  from app.services.ocr import ocr_service
 
 
6
  from app.models.schemas import AnalysisResponse, HealthResponse
7
 
8
  router = APIRouter()
@@ -10,50 +13,82 @@ logger = logging.getLogger("uvicorn")
10
 
11
  @router.get("/health", response_model=HealthResponse)
12
  async def health_check():
13
- return {"status": "ok", "message": "VerifAI Backend is running"}
14
 
15
  @router.post("/analyze", response_model=AnalysisResponse)
16
  async def analyze(
 
17
  text: Optional[str] = Form(None),
18
  file: Optional[UploadFile] = File(None)
19
  ):
20
  """
21
- Main analysis endpoint.
22
- Accepts 'text' (Form data) OR 'file' (UploadFile).
23
  """
24
- content_to_analyze = ""
25
- source_type = "text"
26
-
27
  try:
 
 
 
 
28
  if text:
29
- content_to_analyze = text
30
- logger.info("Received text analysis request")
31
- elif file:
32
- logger.info(f"Received file analysis request: {file.filename}")
33
- source_type = "image"
34
- # Read file bytes
35
- contents = await file.read()
36
- # Run OCR
37
- extracted_text = ocr_service.extract_text(contents)
38
- if not extracted_text:
39
- logger.warning("OCR failed to extract text")
40
- raise HTTPException(status_code=400, detail="Could not extract text from image")
41
- content_to_analyze = extracted_text
42
- logger.info(f"OCR Success. Extracted {len(content_to_analyze)} chars")
43
- else:
44
- raise HTTPException(status_code=400, detail="No text or file provided")
45
 
46
- if len(content_to_analyze.split()) < 5:
47
- raise HTTPException(status_code=400, detail="Text too short for analysis (min 5 words)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- # Run Analysis
50
- result = analysis_service.analyze(content_to_analyze)
51
- logger.info(f"Analysis Complete. Score: {result['score']}")
52
-
53
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- except HTTPException as he:
56
- raise he
57
  except Exception as e:
58
  logger.error(f"Analysis Error: {str(e)}")
59
- raise HTTPException(status_code=500, detail="Internal Server Error during analysis")
 
1
+ from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Request
2
  from typing import Optional
3
  import logging
4
+ from fastapi.concurrency import run_in_threadpool
5
+ from app.services.detector import analysis_service as text_service
6
  from app.services.ocr import ocr_service
7
+ from app.services.image_detector import image_analysis_service
8
+ from app.services.video_detector import video_analysis_service
9
  from app.models.schemas import AnalysisResponse, HealthResponse
10
 
11
  router = APIRouter()
 
13
 
14
@router.get("/health", response_model=HealthResponse)
async def health_check():
    """Liveness probe: confirms the API process is up and serving."""
    return {
        "status": "ok",
        "message": "VerifAI Backend is running (Text, Image, Video)",
    }
17
 
18
@router.post("/analyze", response_model=AnalysisResponse)
async def analyze(
    request: Request,
    text: Optional[str] = Form(None),
    file: Optional[UploadFile] = File(None)
):
    """
    Multi-Modal Analysis Endpoint.

    Accepts raw 'text' (form field) OR a 'file' upload (image/video) and
    routes it to the matching analysis service. Model inference runs in a
    threadpool so the async event loop is never blocked.
    """
    try:
        # Debug Logging
        logger.info(f"Analyze Request: Content-Type={request.headers.get('content-type')}")

        # 1. Direct Text Input
        if text:
            logger.info("Analyzing Text Input")
            # Run text analysis in threadpool to avoid blocking event loop
            return await run_in_threadpool(text_service.analyze, text)

        # 2. File Input
        if file:
            content_type = file.content_type or ""
            logger.info(f"Analyzing File: {file.filename} ({content_type})")

            file_bytes = await file.read()

            # --- VIDEO ANALYSIS ---
            if "video" in content_type:
                logger.info("Running Video Analysis")
                result = await run_in_threadpool(video_analysis_service.analyze_video, file_bytes)
                return {
                    "score": result['score'],
                    "verdict": result['verdict'],
                    "media_details": {"type": "video", "frames": result.get('frame_details')}
                }

            # --- IMAGE ANALYSIS (Dual Mode: AI Check + OCR) ---
            if "image" in content_type:
                logger.info("Running Image Analysis")

                # A. Check if Image is AI-Generated (Threadpool)
                image_result = await run_in_threadpool(image_analysis_service.analyze_image, file_bytes)
                logger.info(f"Image AI Score: {image_result['score']}")

                # If heavily AI, return immediately
                if image_result['score'] > 80:
                    return {
                        "score": image_result['score'],
                        "verdict": "AI-Generated Image",
                        "media_details": {"type": "image", "detail": "Image detected as AI generated"}
                    }

                # B. OCR Check (Threadpool)
                extracted_text = await run_in_threadpool(ocr_service.extract_text, file_bytes)

                if extracted_text and len(extracted_text.split()) > 5:
                    logger.info("OCR found text, analyzing integrity...")
                    text_result = await run_in_threadpool(text_service.analyze, extracted_text)

                    text_result["media_details"] = {
                        "type": "image_ocr",
                        "image_ai_score": image_result['score']
                    }
                    return text_result

                # C. No text found? Return Image Analysis
                return {
                    "score": image_result['score'],
                    "verdict": image_result['verdict'] if image_result['score'] > 50 else "Real/No Text",
                    "media_details": {"type": "image", "detail": "No readable text found"}
                }

        raise HTTPException(status_code=400, detail="No text or valid file provided")

    except HTTPException:
        # BUGFIX: HTTPException subclasses Exception, so without this explicit
        # re-raise the deliberate 400 above was swallowed by the generic handler
        # and converted into a 500.
        raise
    except Exception as e:
        logger.error(f"Analysis Error: {str(e)}")
        # NOTE(review): str(e) can leak internal details to clients — consider a
        # generic message in production.
        raise HTTPException(status_code=500, detail=str(e))
app/core/config.py CHANGED
@@ -1,14 +1,25 @@
1
  import os
 
2
 
3
- class Settings:
4
  PROJECT_NAME: str = "VerifAI Backend"
5
  API_V1_STR: str = "/api/v1"
6
 
7
- # Models
8
- DETECTOR_MODEL_NAME: str = "Hello-SimpleAI/chatgpt-detector-roberta"
 
 
 
 
 
 
 
9
  METRIC_MODEL_NAME: str = "gpt2"
10
 
11
- # OCR
12
- OCR_LANGUAGES: list = ['en']
 
 
 
13
 
14
  settings = Settings()
 
1
  import os
2
+ from pydantic_settings import BaseSettings
3
 
4
class Settings(BaseSettings):
    """Application configuration, env-overridable via pydantic BaseSettings."""

    PROJECT_NAME: str = "VerifAI Backend"
    API_V1_STR: str = "/api/v1"

    # Cloud deployment: bind every interface inside the container; port 7860
    # is the Hugging Face Spaces default, overridable through $PORT.
    HOST: str = "0.0.0.0"
    PORT: int = int(os.environ.get("PORT", 7860))

    # AI model identifiers.
    # Text: RoBERTa-base finetuned for AI-text detection.
    TEXT_MODEL_NAME: str = "fakespot-ai/roberta-base-ai-text-detection-v1"
    METRIC_MODEL_NAME: str = "gpt2"
    # Image: distilled detector for AI-generated images.
    IMAGE_MODEL_NAME: str = "umm-maybe/AI-image-detector"

    class Config:
        case_sensitive = True


settings = Settings()
app/models/schemas.py CHANGED
@@ -13,8 +13,10 @@ class Segment(BaseModel):
13
  class AnalysisResponse(BaseModel):
14
  score: float
15
  verdict: str
16
- metrics: Metrics
17
- segments: list[Segment]
 
 
18
 
19
  class HealthResponse(BaseModel):
20
  status: str
 
13
class AnalysisResponse(BaseModel):
    """Unified response body for text, image and video analysis results."""

    score: float
    verdict: str
    # Text-analysis extras; omitted for pure image/video results.
    metrics: Optional[Metrics] = None
    segments: Optional[list[Segment]] = None
    error: Optional[str] = None
    media_details: Optional[dict] = None  # For Image/Video specific details
20
 
21
  class HealthResponse(BaseModel):
22
  status: str
app/services/detector.py CHANGED
@@ -4,22 +4,42 @@ import math
4
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
5
  from app.core.config import settings
6
 
7
- class AnalysisService:
8
  def __init__(self):
9
- print("Loading AI Models... This might take a moment.")
10
 
11
- # 1. Load Detector (Classification)
12
- self.detector_tokenizer = AutoTokenizer.from_pretrained(settings.DETECTOR_MODEL_NAME)
13
- self.detector_model = AutoModelForSequenceClassification.from_pretrained(settings.DETECTOR_MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # 2. Load Metric Model (Perplexity - GPT2)
16
- self.metric_tokenizer = AutoTokenizer.from_pretrained(settings.METRIC_MODEL_NAME)
17
- self.metric_model = AutoModelForCausalLM.from_pretrained(settings.METRIC_MODEL_NAME)
 
 
 
 
 
18
 
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
  self.detector_model.to(self.device)
21
- self.metric_model.to(self.device)
22
- print(f"Models loaded on {self.device}")
 
23
 
24
  def calculate_perplexity(self, text):
25
  """
@@ -27,6 +47,9 @@ class AnalysisService:
27
  Lower perplexity = implementation of model training data = likely AI.
28
  Higher perplexity = more random/creative = likely Human.
29
  """
 
 
 
30
  encodings = self.metric_tokenizer(text, return_tensors="pt")
31
  input_ids = encodings.input_ids.to(self.device)
32
 
@@ -69,7 +92,9 @@ class AnalysisService:
69
  logits = outputs.logits
70
  probs = torch.softmax(logits, dim=1)
71
 
72
- # Hello-SimpleAI/chatgpt-detector-roberta: Label 0 is "Human", Label 1 is "ChatGPT" (AI)
 
 
73
  ai_prob = probs[0][1].item() * 100
74
 
75
  return ai_prob
@@ -103,7 +128,7 @@ class AnalysisService:
103
  verdict = "Human"
104
  if ai_probability > 80:
105
  verdict = "AI-Generated"
106
- elif ai_probability > 40:
107
  verdict = "Mixed/Uncertain"
108
 
109
  return {
@@ -116,4 +141,4 @@ class AnalysisService:
116
  "segments": segments
117
  }
118
 
119
- analysis_service = AnalysisService()
 
4
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
5
  from app.core.config import settings
6
 
7
+ class TextAnalysisService:
8
  def __init__(self):
9
+ print("Loading Text AI Models... This might take a moment.")
10
 
11
+ # 1. Load Detector (Classification) - Fakespot-AI
12
+ try:
13
+ # Use custom cache to be safe
14
+ cache_dir = "/tmp/hf_cache"
15
+
16
+ self.detector_tokenizer = AutoTokenizer.from_pretrained(
17
+ settings.TEXT_MODEL_NAME,
18
+ cache_dir=cache_dir
19
+ )
20
+ self.detector_model = AutoModelForSequenceClassification.from_pretrained(
21
+ settings.TEXT_MODEL_NAME,
22
+ cache_dir=cache_dir
23
+ )
24
+ print(f"Model ID2LABEL: {self.detector_model.config.id2label}")
25
+ except Exception as e:
26
+ print(f"Error loading text detector: {e}")
27
+ raise e
28
 
29
  # 2. Load Metric Model (Perplexity - GPT2)
30
+ try:
31
+ self.metric_tokenizer = AutoTokenizer.from_pretrained(settings.METRIC_MODEL_NAME)
32
+ self.metric_model = AutoModelForCausalLM.from_pretrained(settings.METRIC_MODEL_NAME)
33
+ except Exception as e:
34
+ print(f"Error loading metric model: {e}")
35
+ # Non-critical failure for metrics
36
+ self.metric_model = None
37
 
38
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
39
  self.detector_model.to(self.device)
40
+ if self.metric_model:
41
+ self.metric_model.to(self.device)
42
+ print(f"Text Models loaded on {self.device}")
43
 
44
  def calculate_perplexity(self, text):
45
  """
 
47
  Lower perplexity = implementation of model training data = likely AI.
48
  Higher perplexity = more random/creative = likely Human.
49
  """
50
+ if not self.metric_model:
51
+ return 0.0
52
+
53
  encodings = self.metric_tokenizer(text, return_tensors="pt")
54
  input_ids = encodings.input_ids.to(self.device)
55
 
 
92
  logits = outputs.logits
93
  probs = torch.softmax(logits, dim=1)
94
 
95
+ # Fakespot uses Label 0 = Real, Label 1 = Fake (AI)
96
+ # Our test showed "Messy Human" got 99% on index 0 (Real).
97
+ # So prob[1] is the AI Probability.
98
  ai_prob = probs[0][1].item() * 100
99
 
100
  return ai_prob
 
128
  verdict = "Human"
129
  if ai_probability > 80:
130
  verdict = "AI-Generated"
131
+ elif ai_probability > 50: # Lower threshold slightly for this robust model
132
  verdict = "Mixed/Uncertain"
133
 
134
  return {
 
141
  "segments": segments
142
  }
143
 
144
+ analysis_service = TextAnalysisService() # Keep variable name compatible for now or update routes
app/services/image_detector.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from PIL import Image
3
+ import io
4
+ from app.core.config import settings
5
+
6
class ImageAnalysisService:
    """Detects AI-generated images via a HuggingFace image-classification pipeline."""

    # Label vocabularies seen across AI-image detector models.
    _AI_LABELS = ('artificial', 'ai', 'fake', 'generated')
    _HUMAN_LABELS = ('human', 'real')

    def __init__(self):
        print("Loading Image AI Model... This might take a moment.")
        try:
            # umm-maybe/AI-image-detector typically returns labels like
            # "artificial" or "human".
            self.classifier = pipeline("image-classification", model=settings.IMAGE_MODEL_NAME)
            print("Image Model loaded successfully.")
        except Exception as e:
            # Degrade gracefully: analyze_image reports the failure instead of crashing.
            print(f"Error loading image model: {e}")
            self.classifier = None

    def _ai_score_from_results(self, results):
        """Map raw pipeline output (list of {'label','score'} dicts) to an
        AI probability in [0, 100].

        Prefer an explicit AI-style label; otherwise fall back to inverting a
        human-style label so a top-1-only classifier still yields a meaningful
        score. (The previous version silently returned 0 in that case.)
        """
        for result in results:
            if result['label'].lower() in self._AI_LABELS:
                return result['score'] * 100
        for result in results:
            if result['label'].lower() in self._HUMAN_LABELS:
                # Binary classifier: P(ai) ~= 1 - P(human).
                return (1.0 - result['score']) * 100
        return 0.0

    def analyze_image(self, image_data: bytes):
        """Classify raw image bytes; returns {'score', 'verdict', 'details'}."""
        if not self.classifier:
            return {"score": 0, "verdict": "Error: Model not loaded"}

        try:
            # Convert bytes to PIL Image
            image = Image.open(io.BytesIO(image_data)).convert("RGB")

            # Run prediction; results are a list of {'label', 'score'} dicts.
            results = self.classifier(image)
            ai_score = self._ai_score_from_results(results)

            verdict = "Human"
            if ai_score > 90:
                verdict = "AI-Generated"
            elif ai_score > 60:
                verdict = "Likely AI"
            elif ai_score > 40:
                verdict = "Mixed/Uncertain"

            return {
                "score": round(ai_score, 2),
                "verdict": verdict,
                "details": results  # Return raw details for debugging if needed
            }

        except Exception as e:
            print(f"Image Analysis Error: {e}")
            return {"score": 0, "verdict": "Analysis Failed"}

image_analysis_service = ImageAnalysisService()
app/services/video_detector.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import os
3
+ import tempfile
4
+ import numpy as np
5
+ from app.services.image_detector import image_analysis_service
6
+
7
class VideoAnalysisService:
    """Deepfake screening for videos: samples keyframes, scores each with the
    image detector, then aggregates the frame scores into one verdict."""

    def __init__(self):
        # Stateless: all heavy lifting is delegated to image_analysis_service.
        pass

    def _aggregate(self, frame_scores):
        """Combine per-frame AI scores into (final_score, verdict).

        Biased toward the max score: one clearly fake frame makes the whole
        video suspect.
        """
        max_score = max(frame_scores)
        avg_score = sum(frame_scores) / len(frame_scores)
        final_score = (max_score * 0.7) + (avg_score * 0.3)

        verdict = "Real Video"
        if final_score > 85:
            verdict = "Deepfake/AI"
        elif final_score > 60:
            verdict = "Suspicious"
        return round(final_score, 2), verdict

    def analyze_video(self, video_bytes: bytes):
        """
        Analyzes a video by extracting keyframes and checking them for AI content.
        """
        if not image_analysis_service.classifier:
            return {"score": 0, "verdict": "Error: Image Model not loaded"}

        # 1. Save bytes to a temp file: OpenCV needs a file path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp:
            temp.write(video_bytes)
            temp_path = temp.name

        try:
            cap = cv2.VideoCapture(temp_path)
            if not cap.isOpened():
                return {"score": 0, "verdict": "Error: Could not open video"}

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                # Container opened but reported no frames.
                cap.release()
                return {"score": 0, "verdict": "Could not extract frames"}

            # 2. Sample at most 5 keyframes (start .. near end). Deduplicate
            # the indices so very short clips aren't scored on the same frame
            # multiple times.
            sample_points = [0, 0.25, 0.5, 0.75, 0.95]
            frame_indices = sorted({int(point * total_frames) for point in sample_points})

            frames_to_check = []
            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if ret:
                    frames_to_check.append(frame)

            cap.release()

            # 3. Score each frame with the image detector.
            frame_scores = []
            frame_details = []
            for i, frame in enumerate(frames_to_check):
                # Re-encode the BGR frame to JPEG bytes: the image service
                # accepts bytes, not raw arrays.
                _, buffer = cv2.imencode('.jpg', frame)
                jpg_bytes = buffer.tobytes()

                result = image_analysis_service.analyze_image(jpg_bytes)
                frame_scores.append(result['score'])
                frame_details.append({
                    "frame_index": i,
                    "score": result['score'],
                    "verdict": result['verdict']
                })

            # 4. Aggregate Results
            if not frame_scores:
                return {"score": 0, "verdict": "Could not extract frames"}

            final_score, verdict = self._aggregate(frame_scores)
            return {
                "score": final_score,
                "verdict": verdict,
                "frames_analyzed": len(frames_to_check),
                "frame_details": frame_details
            }

        except Exception as e:
            print(f"Video Analysis Error: {e}")
            return {"score": 0, "verdict": "Video Analysis Failed"}

        finally:
            # Cleanup temp file
            if os.path.exists(temp_path):
                os.remove(temp_path)

video_analysis_service = VideoAnalysisService()
requirements.txt CHANGED
@@ -1,10 +1,15 @@
1
- fastapi
2
- uvicorn
3
- python-multipart
4
- torch --index-url https://download.pytorch.org/whl/cpu
5
- transformers
6
- scipy
7
- numpy
8
- easyocr
9
- pillow
10
- opencv-python-headless
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ python-multipart>=0.0.9
4
+ torch>=2.2.0
5
+ transformers>=4.40.0
6
+ accelerate>=0.26.0
7
+ numpy==1.26.2
8
+ slowapi==0.1.9
9
+ easyocr==1.7.1
10
+ pillow==10.2.0
11
+ opencv-python-headless==4.8.1.78
12
+ tf-keras==2.15.0
13
+ pydantic-settings>=2.0.0
14
+ protobuf==4.25.3
15
+ sentencepiece