Spaces:
Configuration error
Configuration error
feat: adding verifact services with backend code and multi-agent workflow
Browse files- .env.example +4 -0
- .gitignore +207 -0
- CONTRIBUTING.md +150 -0
- README.md +1 -11
- app/__init__.py +0 -0
- app/api/main.py +34 -0
- app/api/v1/__init__.py +0 -0
- app/api/v1/endpoints.py +66 -0
- app/core/__init__.py +0 -0
- app/core/cache.py +128 -0
- app/core/config.py +54 -0
- app/core/models.py +83 -0
- app/services/__init__.py +0 -0
- app/services/claims/__init__.py +0 -0
- app/services/claims/agent.py +223 -0
- app/services/claims/tools.py +146 -0
- app/services/fact_checker/__init__.py +0 -0
- app/services/fact_checker/agent.py +81 -0
- app/services/fact_checker/tools.py +84 -0
- app/services/identify/__init__.py +0 -0
- app/services/identify/agent.py +109 -0
- app/services/identify/tools.py +225 -0
- app/services/llm_wrapper.py +51 -0
- app/services/orchestrator.py +210 -0
- app/services/shared_tools.py +46 -0
- poetry.lock +0 -0
- pyproject.toml +39 -0
- requirements.txt +14 -0
- tests/test_api.py +102 -0
.env.example
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
| 2 |
+
FIRECRAWL_API_KEY=your_firecrawl_api_key_here
|
| 3 |
+
URLSCAN_API_KEY=your_urlscan_api_key_here
|
| 4 |
+
|
.gitignore
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
.vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to Verifacts Backend
|
| 2 |
+
|
| 3 |
+
Welcome to the Verifacts engineering team! This guide will help you set up your development environment and understand our engineering standards.
|
| 4 |
+
|
| 5 |
+
## 🚀 Environment Setup
|
| 6 |
+
|
| 7 |
+
We use **Poetry** for dependency management to ensure deterministic builds across all micro-modules.
|
| 8 |
+
|
| 9 |
+
### 1. Installation
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
# Install Project Dependencies
|
| 13 |
+
poetry install
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
### 2\. Configuration
|
| 17 |
+
|
| 18 |
+
Copy the example environment file:
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
cp .env.example .env
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
**Required Variables:**
|
| 25 |
+
|
| 26 |
+
* `GEMINI_API_KEY`: For LLM extraction.
|
| 27 |
+
* `FIRECRAWL_API_KEY`: For web scraping.
|
| 28 |
+
* `GOOGLE_FACT_CHECK_KEY`: For verification.
|
| 29 |
+
|
| 30 |
+
### 3\. Running the Server
|
| 31 |
+
|
| 32 |
+
Start the hot-reloading development server:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
poetry run uvicorn app.api.main:main --reload
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
-----
|
| 39 |
+
|
| 40 |
+
## 🌳 Git Workflow & Branching Strategy
|
| 41 |
+
|
| 42 |
+
We follow a strict branching model to keep our codebase stable. **Never push directly to `main`.**
|
| 43 |
+
|
| 44 |
+
### Branch Naming Convention
|
| 45 |
+
|
| 46 |
+
* **Features:** `feat/short-description` (e.g., `feat/add-sentiment-node`)
|
| 47 |
+
* **Bug Fixes:** `fix/short-description` (e.g., `fix/firecrawl-timeout`)
|
| 48 |
+
* **Documentation:** `docs/short-description` (e.g., `docs/update-api-schema`)
|
| 49 |
+
* **Chore/Refactor:** `chore/short-description` (e.g., `chore/bump-poetry-version`)
|
| 50 |
+
|
| 51 |
+
### The Workflow
|
| 52 |
+
|
| 53 |
+
1. **Sync with Main:**
|
| 54 |
+
```bash
|
| 55 |
+
git checkout main
|
| 56 |
+
git pull origin main
|
| 57 |
+
```
|
| 58 |
+
2. **Create Branch:**
|
| 59 |
+
```bash
|
| 60 |
+
git checkout -b feat/my-new-feature
|
| 61 |
+
```
|
| 62 |
+
3. **Code & Test:** Write your code and ensure `poetry run pytest` passes.
|
| 63 |
+
4. **Push & PR:** Push your branch and open a Pull Request (PR) for review.
|
| 64 |
+
|
| 65 |
+
-----
|
| 66 |
+
|
| 67 |
+
## 📝 Commit Message Standards
|
| 68 |
+
|
| 69 |
+
We use **Conventional Commits** to automate our changelogs. Your commit message must look like this:
|
| 70 |
+
|
| 71 |
+
`<type>(<scope>): <short summary>`
|
| 72 |
+
|
| 73 |
+
### Types
|
| 74 |
+
|
| 75 |
+
* `feat`: A new feature (e.g., adding a new LangGraph node).
|
| 76 |
+
* `fix`: A bug fix.
|
| 77 |
+
* `docs`: Documentation only changes.
|
| 78 |
+
* `style`: Formatting, missing semi-colons, etc. (no code change).
|
| 79 |
+
* `refactor`: A code change that neither fixes a bug nor adds a feature.
|
| 80 |
+
* `perf`: A code change that improves performance.
|
| 81 |
+
* `test`: Adding missing tests.
|
| 82 |
+
* `chore`: Maintenance tasks (e.g., updating `.gitignore`).
|
| 83 |
+
|
| 84 |
+
### Examples
|
| 85 |
+
|
| 86 |
+
* ✅ `feat(graph): add sentiment analysis node to workflow`
|
| 87 |
+
* ✅ `fix(api): handle 404 error from Firecrawl`
|
| 88 |
+
* ✅ `docs(readme): update setup instructions for Windows`
|
| 89 |
+
* ❌ `Fixed the bug` (Too vague)
|
| 90 |
+
* ❌ `Added new agent` (Missing scope)
|
| 91 |
+
|
| 92 |
+
-----
|
| 93 |
+
|
| 94 |
+
## 🛠️ How to Add a New Feature (The "Node" Workflow)
|
| 95 |
+
|
| 96 |
+
Adding intelligence to Verifacts means adding a **Node** to the LangGraph. Follow this 4-step process:
|
| 97 |
+
|
| 98 |
+
### Step 1: Create the Logic (The Module)
|
| 99 |
+
|
| 100 |
+
Create a new file in `app/graph/nodes/`. It must accept `AgentState` and return a dictionary of updates.
|
| 101 |
+
|
| 102 |
+
* *File:* `app/graph/nodes/sentiment.py`
|
| 103 |
+
* *Function:* `async def sentiment_node(state: AgentState) -> Dict[str, Any]: ...`
|
| 104 |
+
|
| 105 |
+
### Step 2: Update the State
|
| 106 |
+
|
| 107 |
+
If your node produces new data (e.g., a "sentiment score"), define it in the shared state.
|
| 108 |
+
|
| 109 |
+
* *File:* `app/graph/state.py`
|
| 110 |
+
* *Action:* Add `sentiment_score: float` to the `AgentState` TypedDict.
|
| 111 |
+
|
| 112 |
+
### Step 3: Register in the Graph
|
| 113 |
+
|
| 114 |
+
Wire your new node into the orchestration flow.
|
| 115 |
+
|
| 116 |
+
* *File:* `app/graph/workflow.py`
|
| 117 |
+
* *Action:*
|
| 118 |
+
1. `workflow.add_node("sentiment", sentiment_node)`
|
| 119 |
+
2. Define when it runs (e.g., `workflow.add_edge("reader", "sentiment")`).
|
| 120 |
+
|
| 121 |
+
### Step 4: Expose via API (Optional)
|
| 122 |
+
|
| 123 |
+
If the frontend needs to see this data, update the response model.
|
| 124 |
+
|
| 125 |
+
* *File:* `app/api/v1/models.py` (or `server.py`)
|
| 126 |
+
* *Action:* Add the field to the Pydantic Response model.
|
| 127 |
+
|
| 128 |
+
-----
|
| 129 |
+
|
| 130 |
+
## 🧪 Testing Requirements
|
| 131 |
+
|
| 132 |
+
Before submitting a PR, ensure you have added tests for your new node.
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
# Run unit tests
|
| 136 |
+
poetry run pytest
|
| 137 |
+
|
| 138 |
+
# Run linting manually (Recommended)
|
| 139 |
+
poetry run ruff check .
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
## Pull Request Reviews
|
| 143 |
+
All PRs must be reviewed by at least one other team member. Look for:
|
| 144 |
+
|
| 145 |
+
* Code quality and adherence to standards.
|
| 146 |
+
* Proper testing coverage.
|
| 147 |
+
* Clear and descriptive commit messages.
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
Thank you for contributing to Verifacts! Your efforts help us build a reliable and intelligent verification platform.
|
README.md
CHANGED
|
@@ -1,11 +1 @@
|
|
| 1 |
-
|
| 2 |
-
title: Verifacts Backend
|
| 3 |
-
emoji: 🌖
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
---
|
| 10 |
-
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# verifacts-backend
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/__init__.py
ADDED
|
File without changes
|
app/api/main.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
|
| 4 |
+
from app.api.v1.endpoints import router as v1_router
|
| 5 |
+
from app.core.config import config
|
| 6 |
+
|
| 7 |
+
main = FastAPI(
|
| 8 |
+
title=config.PROJECT_NAME,
|
| 9 |
+
version=config.VERSION,
|
| 10 |
+
openapi_url=f"{config.API_PREFIX}/openapi.json"
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
main.add_middleware(
|
| 14 |
+
CORSMiddleware,
|
| 15 |
+
allow_origins=["*"],
|
| 16 |
+
allow_credentials=True,
|
| 17 |
+
allow_methods=["*"],
|
| 18 |
+
allow_headers=["*"],
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
main.include_router(v1_router)
|
| 22 |
+
|
| 23 |
+
@main.get("/")
|
| 24 |
+
async def root():
|
| 25 |
+
return {"message": "Welcome to the Verifacts Backend API!"}
|
| 26 |
+
|
| 27 |
+
@main.get("/health")
|
| 28 |
+
async def health_check():
|
| 29 |
+
return {
|
| 30 |
+
"status": "operational",
|
| 31 |
+
"message": "The Verifacts Backend API is running smoothly.",
|
| 32 |
+
"version": config.VERSION
|
| 33 |
+
}
|
| 34 |
+
|
app/api/v1/__init__.py
ADDED
|
File without changes
|
app/api/v1/endpoints.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from fastapi import APIRouter, HTTPException, Depends
|
| 3 |
+
from app.core.models import AnalysisRequest, AnalysisResponse, IdentityData, VerdictData
|
| 4 |
+
from app.core.config import config
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
logger.setLevel(logging.INFO)
|
| 8 |
+
|
| 9 |
+
router = APIRouter(prefix=config.API_PREFIX, tags=["v1"])
|
| 10 |
+
|
| 11 |
+
@router.post("/analyze", response_model=AnalysisResponse)
|
| 12 |
+
async def analyze_content(request: AnalysisRequest) -> AnalysisResponse:
|
| 13 |
+
"""
|
| 14 |
+
Core v1 endpoint to analyze and verify the sources of web contents.
|
| 15 |
+
Triggers the analysis pipeline and multi-agent Langgraph workflow.
|
| 16 |
+
"""
|
| 17 |
+
try:
|
| 18 |
+
initial_state = {
|
| 19 |
+
"url": str(request.url),
|
| 20 |
+
"selection": request.selection,
|
| 21 |
+
"force_refresh": request.force_refresh,
|
| 22 |
+
"claims": [],
|
| 23 |
+
"errors": [],
|
| 24 |
+
"verification_results": [],
|
| 25 |
+
"extracted_claims": [],
|
| 26 |
+
"agent_reports": [],
|
| 27 |
+
}
|
| 28 |
+
logger.info(f"Starting analysis for URL: {request.url}")
|
| 29 |
+
|
| 30 |
+
final_state = initial_state
|
| 31 |
+
|
| 32 |
+
identity_data = IdentityData(
|
| 33 |
+
verified=final_state.get("is_verified", False),
|
| 34 |
+
score=final_state.get("credibility_score", 0.0),
|
| 35 |
+
)
|
| 36 |
+
verdict_data = VerdictData(
|
| 37 |
+
status=final_state.get("verdict_status", "Unverified"),
|
| 38 |
+
claims_counted=final_state.get("claims_counted", 0),
|
| 39 |
+
claims_verified=final_state.get("claims_verified", 0),
|
| 40 |
+
claims_sourced=final_state.get("claims_sourced", 0)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
agent_reports = final_state.get("agent_reports", [])
|
| 44 |
+
formatted_reports = [
|
| 45 |
+
{
|
| 46 |
+
"agent": report.get("agent_name", "unknown"),
|
| 47 |
+
"claims": report.get("output", []),
|
| 48 |
+
"errors": report.get("errors", [])
|
| 49 |
+
}
|
| 50 |
+
for report in agent_reports
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
response = AnalysisResponse(
|
| 54 |
+
status=final_state.get("status", "Completed"),
|
| 55 |
+
verdict=verdict_data,
|
| 56 |
+
details={
|
| 57 |
+
"reports": formatted_reports,
|
| 58 |
+
"raw_claims": final_state.get("verification_results", [])
|
| 59 |
+
},
|
| 60 |
+
identity=identity_data
|
| 61 |
+
)
|
| 62 |
+
return response
|
| 63 |
+
|
| 64 |
+
except Exception as e:
|
| 65 |
+
logger.error(f"Error during analysis: {str(e)}")
|
| 66 |
+
raise HTTPException(status_code=500, detail=f"Analysis of web content failed {str(e)}")
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/cache.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, Optional
|
| 3 |
+
|
| 4 |
+
from redis import Redis
|
| 5 |
+
from langchain_core.globals import set_llm_cache
|
| 6 |
+
from langchain_community.cache import RedisCache, RedisSemanticCache
|
| 7 |
+
from langchain_openai import OpenAIEmbeddings
|
| 8 |
+
|
| 9 |
+
from app.core.config import config
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
logger.setLevel(logging.INFO)
|
| 13 |
+
|
| 14 |
+
redis_client = Redis.from_url(config.REDIS_URL)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def init_global_cache(semantic: bool=True) -> None:
|
| 18 |
+
"""Initializes a global Redis cache for LangChain operations."""
|
| 19 |
+
global redis_client
|
| 20 |
+
if not redis_client:
|
| 21 |
+
logger.warning("Redis client is not configured; caching will be disabled.")
|
| 22 |
+
return
|
| 23 |
+
|
| 24 |
+
if semantic:
|
| 25 |
+
logger.info("Initializing Redis Semantic Cache with Google Embeddings.")
|
| 26 |
+
embeddings = OpenAIEmbeddings(
|
| 27 |
+
model="text-embedding-3-small"
|
| 28 |
+
)
|
| 29 |
+
cache = RedisSemanticCache(
|
| 30 |
+
redis_client=redis_client,
|
| 31 |
+
embedding_function=embeddings,
|
| 32 |
+
index_name=config.REDIS_SEMANTIC_INDEX or "langchain_semantic_cache",
|
| 33 |
+
score_threshold=0.85
|
| 34 |
+
)
|
| 35 |
+
else:
|
| 36 |
+
logger.info("Initializing standard Redis Cache.")
|
| 37 |
+
cache = RedisCache(redis_client=redis_client)
|
| 38 |
+
|
| 39 |
+
from langchain_core.globals import set_llm_cache
|
| 40 |
+
set_llm_cache(cache)
|
| 41 |
+
logger.info("Global Redis cache initialized successfully.")
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
# Test the connection
|
| 45 |
+
redis_client.ping()
|
| 46 |
+
logger.info("Successfully connected to Redis server.")
|
| 47 |
+
except Exception as e:
|
| 48 |
+
logger.error(f"Failed to connect to Redis server: {e}")
|
| 49 |
+
redis_client = None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def cache_get(key:str) -> Optional[Any]:
|
| 53 |
+
"""Retrieve a value from the Redis cache by key."""
|
| 54 |
+
global redis_client
|
| 55 |
+
if not redis_client:
|
| 56 |
+
logger.warning("Redis client is not configured; cannot get cache.")
|
| 57 |
+
return None
|
| 58 |
+
try:
|
| 59 |
+
value = redis_client.get(key)
|
| 60 |
+
if value is not None:
|
| 61 |
+
logger.info(f"Cache hit for key: {key}")
|
| 62 |
+
else:
|
| 63 |
+
logger.info(f"Cache miss for key: {key}")
|
| 64 |
+
return value
|
| 65 |
+
except Exception as e:
|
| 66 |
+
logger.error(f"Error retrieving key {key} from cache: {e}")
|
| 67 |
+
return None
|
| 68 |
+
|
| 69 |
+
def cache_set(key:str, value:Any, ttl:int=config.CACHE_TTL) -> None:
|
| 70 |
+
"""Set a value in the Redis cache with an optional TTL."""
|
| 71 |
+
global redis_client
|
| 72 |
+
if not redis_client:
|
| 73 |
+
logger.warning("Redis client is not configured; cannot set cache.")
|
| 74 |
+
return
|
| 75 |
+
try:
|
| 76 |
+
redis_client.set(name=key, value=value, ex=ttl)
|
| 77 |
+
logger.info(f"Cache set for key: {key} with TTL: {ttl} seconds")
|
| 78 |
+
except Exception as e:
|
| 79 |
+
logger.error(f"Error setting key {key} in cache: {e}")
|
| 80 |
+
|
| 81 |
+
def cache_delete(key:str) -> None:
|
| 82 |
+
"""Delete a value from the Redis cache by key."""
|
| 83 |
+
global redis_client
|
| 84 |
+
if not redis_client:
|
| 85 |
+
logger.warning("Redis client is not configured; cannot delete cache.")
|
| 86 |
+
return
|
| 87 |
+
try:
|
| 88 |
+
redis_client.delete(key)
|
| 89 |
+
logger.info(f"Cache deleted for key: {key}")
|
| 90 |
+
except Exception as e:
|
| 91 |
+
logger.error(f"Error deleting key {key} from cache: {e}")
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def cache_stats() -> Optional[dict]:
|
| 95 |
+
"""Retrieve Redis cache statistics."""
|
| 96 |
+
global redis_client
|
| 97 |
+
if not redis_client:
|
| 98 |
+
logger.warning("Redis client is not configured; cannot get stats.")
|
| 99 |
+
return None
|
| 100 |
+
try:
|
| 101 |
+
info = redis_client.info()
|
| 102 |
+
stats = {
|
| 103 |
+
"used_memory_human": info.get("used_memory_human"),
|
| 104 |
+
"keyspace_hits": info.get("keyspace_hits"),
|
| 105 |
+
"keyspace_misses": info.get("keyspace_misses"),
|
| 106 |
+
"connected_clients": info.get("connected_clients"),
|
| 107 |
+
"uptime_in_seconds": info.get("uptime_in_seconds"),
|
| 108 |
+
}
|
| 109 |
+
logger.info(f"Redis cache stats: {stats}")
|
| 110 |
+
return stats
|
| 111 |
+
except Exception as e:
|
| 112 |
+
logger.error(f"Error retrieving Redis stats: {e}")
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
# Usage Example
|
| 116 |
+
# init_global_cache(semantic=True)
|
| 117 |
+
# #ping
|
| 118 |
+
|
| 119 |
+
# if __name__ == "__main__":
|
| 120 |
+
# if not redis_client:
|
| 121 |
+
# logger.warning("Redis client is not configured; skipping ping.")
|
| 122 |
+
|
| 123 |
+
# if redis_client:
|
| 124 |
+
# try:
|
| 125 |
+
# redis_client.ping()
|
| 126 |
+
# logger.info("Ping to Redis server successful.")
|
| 127 |
+
# except Exception as e:
|
| 128 |
+
# logger.error(f"Ping to Redis server failed: {e}")
|
app/core/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
load_dotenv() # Load environment variables from a .env file if present
|
| 8 |
+
|
| 9 |
+
class Config(BaseSettings):
|
| 10 |
+
"""
|
| 11 |
+
Application configuration settings.
|
| 12 |
+
Reads from environment variables by default.
|
| 13 |
+
"""
|
| 14 |
+
PROJECT_NAME: str = "Verifacts Backend"
|
| 15 |
+
VERSION: str = "1.0.0"
|
| 16 |
+
API_PREFIX: str = "/api/v1"
|
| 17 |
+
|
| 18 |
+
SECRET_KEY: str = os.getenv("SECRET_KEY", "default_secret_key")
|
| 19 |
+
|
| 20 |
+
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
|
| 21 |
+
LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME", "gemini-2.5-flash")
|
| 22 |
+
LLM_TEMPERATURE: float = float(os.getenv("LLM_TEMPERATURE", "0"))
|
| 23 |
+
LLM_MAX_TOKEN: int = int(os.getenv("LLM_MAX_TOKEN", "1024"))
|
| 24 |
+
FIRECRAWL_API_KEY: Optional[str] = os.getenv("FIRECRAWL_API_KEY")
|
| 25 |
+
URLSCAN_API_KEY: Optional[str] = os.getenv("URLSCAN_API_KEY")
|
| 26 |
+
REDIS_URL: str = os.getenv("REDIS_URL", "redis://localhost:6379/0")
|
| 27 |
+
REDIS_HOST: Optional[str] = os.getenv("REDIS_HOST")
|
| 28 |
+
REDIS_PORT: Optional[int] = os.getenv("REDIS_PORT")
|
| 29 |
+
REDIS_PASSWORD: Optional[str] = os.getenv("REDIS_PASSWORD")
|
| 30 |
+
REDIS_DB: Optional[int] = os.getenv("REDIS_DB")
|
| 31 |
+
|
| 32 |
+
# API Configuration
|
| 33 |
+
GOOGLE_FACT_CHECK_API_KEY: str = os.getenv("GOOGLE_FACT_CHECK_KEY", "")
|
| 34 |
+
FACT_CHECK_API_URL: str = (
|
| 35 |
+
"https://factchecktools.googleapis.com/v1alpha1/claims:search"
|
| 36 |
+
)
|
| 37 |
+
TAVILY_API_KEY: Optional[str] = os.getenv("TAVILY_API_KEY")
|
| 38 |
+
|
| 39 |
+
# Performance Settings
|
| 40 |
+
API_TIMEOUT: int = 2 # seconds
|
| 41 |
+
MAX_BATCH_SIZE: int = 20
|
| 42 |
+
|
| 43 |
+
# Cache Settings (for future Redis integration)
|
| 44 |
+
CACHE_ENABLED: bool = True
|
| 45 |
+
CACHE_TTL: int = 86400 # 24 hours in seconds
|
| 46 |
+
|
| 47 |
+
model_config = SettingsConfigDict(
|
| 48 |
+
env_file=".env",
|
| 49 |
+
env_file_encoding="utf-8",
|
| 50 |
+
case_sensitive=True
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
config = Config()
|
app/core/models.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 2 |
+
from typing import Optional, List, Dict, Any, Literal
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class AnalysisRequest(BaseModel):
|
| 6 |
+
url: HttpUrl = Field(..., description="The URL of the webpage to analyze.")
|
| 7 |
+
selection: Optional[str] = Field(
|
| 8 |
+
None,
|
| 9 |
+
description="Optional specific text selection from the webpage."
|
| 10 |
+
)
|
| 11 |
+
force_refresh: bool = Field(
|
| 12 |
+
False,
|
| 13 |
+
description="Whether to force refresh the cached analysis."
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class IdentityData(BaseModel):
|
| 18 |
+
verified: bool = Field(..., description="Whether the source is verified.")
|
| 19 |
+
score: float = Field(..., description="Credibility score of the source (0.0 to 1.0).")
|
| 20 |
+
|
| 21 |
+
class VerdictData(BaseModel):
|
| 22 |
+
status: str = Field(..., description="Verdict status (e.g., true, false, mixed).")
|
| 23 |
+
claims_counted: int = Field(0, description="Number of claims evaluated.")
|
| 24 |
+
claims_verified: int = Field(0, description="Number of claims verified as true.")
|
| 25 |
+
claims_sourced: int = Field(0, description="Number of claims with sources provided.")
|
| 26 |
+
|
| 27 |
+
class AnalysisResponse(BaseModel):
|
| 28 |
+
status: str = Field(..., description="Status of the analysis request.")
|
| 29 |
+
verdict: VerdictData = Field(..., description="Detailed verdict data.")
|
| 30 |
+
identity: IdentityData = Field(..., description="Identity verification data of the source.")
|
| 31 |
+
details: Dict[str, Any] = Field(..., description="Detailed agent reports and findings.")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Provenance(BaseModel):
    """Where a claim came from: user selection, page extraction, or direct input."""

    source: Literal["selection", "extracted", "user_provided"] = Field(..., description="Source of the claim.")
    url: Optional[HttpUrl] = Field(None, description="URL from which the claim was extracted, if applicable.")
    context: Optional[str] = Field(None, description="Contextual information about the claim.")
|
| 38 |
+
|
| 39 |
+
class Claim(BaseModel):
    """An atomic, checkable claim extracted from source content."""

    claim_id: str
    text: str = Field(..., description="The atomic factual claim statement")
    normalized_text: Optional[str] = Field(None, description="Normalized version of the claim text.")
    provenance: Provenance = Field(..., description="Provenance information of the claim.")
    confidence: Optional[float] = Field(None, description="Confidence score of claim extraction (0.0 to 1.0).")
    claim_type: Literal["factual", "opinion", "mixed", "ambiguous"] = Field(..., description="Type of the claim.")
|
| 46 |
+
|
| 47 |
+
class CredibilityVerdict(BaseModel):
    """LLM-produced credibility assessment of a source URL.

    Note: `score` is on a 0-100 scale here, unlike IdentityData.score (0.0-1.0).
    """

    trust_level: str = Field(..., description="Overall trust level of the source (e.g., high, medium, low).")
    score: float = Field(..., description="Credibility score of the source (0-100).")
    red_flags: List[str] = Field(..., description="List of identified red flags affecting credibility.")
    summary: str = Field(..., description="Summary of the credibility assessment.")
    # List[str] (not builtin list[str]) for consistency with the other models in this module.
    source_used: List[str] = Field(..., description="List of sources used in the credibility assessment.")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class FactCheckVerdict(BaseModel):
    """Result for a single claim verification from the fact-checker agent."""

    claim: str = Field(..., description="The factual claim being verified")
    verdict: str = Field(..., description="verified | debunked | mixture | unverified")
    textual_rating: Optional[str] = Field(None, description="Textual rating from the fact-checker")
    corroboration_url: Optional[str] = Field(None, description="URL to the fact-check source")
    fact_checker: Optional[str] = Field(None, description="Name of the fact-checking organization")
    # ISO-like date string of when the fact-check was published, when available.
    checked_date: Optional[str] = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class VerifyResponse(BaseModel):
    """Response model for /verify endpoint"""

    status: str  # "success" or "error"
    mode: str  # "granular" or "full"
    data: dict  # Mode-dependent payload; shape is defined by the endpoint handler.
|
| 71 |
+
|
| 72 |
+
# === Final Output Schema ===
class FinalReport(BaseModel):
    """End-of-pipeline report combining credibility, claims, and fact-check results."""

    url: str = Field(..., description="Original URL")
    credibility: Dict = Field(..., description="Source credibility assessment")
    claims: List[str] = Field(..., description="Extracted factual claims")
    fact_checks: List[Dict] = Field(..., description="Fact-check verdicts per claim")
    search_insights: List[Dict] = Field(default=[], description="Tavily search results with snippets for enrichment")
    overall_verdict: str = Field(..., description="Final truth rating: verified | debunked | mixture | unverified")
    summary: str = Field(..., description="One-paragraph overall summary")
    sources: List[str] = Field(default=[], description="Key corroborating URLs")
|
| 82 |
+
|
| 83 |
+
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/claims/__init__.py
ADDED
|
File without changes
|
app/services/claims/agent.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import List, Dict, Any, Optional
|
| 4 |
+
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from app.services.llm_wrapper import llm_wrapper
|
| 10 |
+
from app.services.claims.tools import ClaimTools
|
| 11 |
+
from app.core.models import Claim, Provenance
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
logger.setLevel(logging.INFO)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ExtractedClaimItem(BaseModel):
    """Single claim item as emitted by the LLM atomization step."""

    text: str = Field(..., description="The extracted claim text.")
    type: str = Field(..., description="The type of the claim (factual, opinion, etc.).")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ClaimsList(BaseModel):
    """Schema for the LLM's JSON output: a list of extracted claims."""

    claims: List[ExtractedClaimItem] = Field(..., description="List of extracted claims.")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ClaimExtractionAgent:
    """
    Agent 2: Claim Extraction Agent.

    Responsibilities:
    1. Decide strategy (passthrough vs atomization) based on the input.
    2. Call scraping tools when only a URL is available.
    3. Use the LLM to extract and classify atomic factual claims.
    """

    def __init__(self):
        # LLM client, JSON parser bound to the ClaimsList schema, scraping tools.
        self.llm = llm_wrapper.get_llm()
        self.output_parser = JsonOutputParser(pydantic_object=ClaimsList)
        self.tools = ClaimTools()

    async def run(self, verdict: Optional[Dict] = None) -> List[Claim]:
        """
        Run the claim extraction pipeline.

        Args:
            verdict: Upstream verdict dict; only the optional keys "url" and
                "selection" are read here.

        Returns:
            A list of extracted Claim objects. On any failure a single
            "ambiguous" fallback claim is returned instead of raising.
        """
        text_to_process = ""
        source_type = "selection"
        url = verdict.get("url") if verdict else None
        selection = verdict.get("selection") if verdict else None
        cleaned_bg = ""  # Background page snippet; stays "" when unavailable.

        if selection:
            logger.info("Using user-provided text selection for claim extraction.")

            clean_sel, _ = self.tools.sanitize_text(selection, max_length=5000)
            if self.tools.looks_like_propmpt_injection(clean_sel):
                logger.warning("Potential prompt injection detected in user selection.")
                return [self._create_ambiguous_claim("Potential prompt injection detected in user selection.", url, source_type)]

            text_to_process = clean_sel

            if url:
                # Best-effort: fetch surrounding page text so the LLM can
                # resolve references ("it", "he", ...) inside the selection.
                try:
                    logger.info(f"Fetching background context from FireCrawl for URL: {url}")
                    full_page_text = await self.tools.scrape_article_text.ainvoke(url)
                    if full_page_text:
                        context_snippet, _ = self.tools.sanitize_text(full_page_text, max_length=2000)
                        cleaned_bg = context_snippet.replace("\n", " ")
                        logger.info("Successfully fetched background context for selection.")

                except Exception as e:
                    # Context fetch failure is non-fatal; extraction proceeds without it.
                    logger.warning(f"Failed to fetch background context from FireCrawl: {str(e)}")

        elif url:
            logger.info(f"No text selection provided, scraping article text from {url}.")

            scraped_text = await self.tools.scrape_article_text.ainvoke(url)

            if not scraped_text:
                logger.warning("No text could be extracted from the article.")
                return [self._create_ambiguous_claim("No text could be extracted from the article.", url, "extracted")]

            text_to_process = scraped_text
            source_type = "extracted"

        if not text_to_process:
            logger.error("No text available for claim extraction after processing.")
            return [self._create_ambiguous_claim("No text available for claim extraction.", url, source_type)]

        # Atomize whole-article text always; atomize a selection only when it
        # looks compound AND we have background context to resolve references.
        has_complexity = " and " in text_to_process.lower() or ";" in text_to_process or "," in text_to_process
        should_atomize = (source_type == "extracted") or (has_complexity and cleaned_bg != "")

        if should_atomize and self.llm:
            return await self._atomize_and_extract_claims(
                text=text_to_process,
                url=url,
                source=source_type,
                source_type=source_type,
                context=cleaned_bg
            )
        else:
            # Passthrough: treat the whole text as one unclassified claim.
            return [self._create_ambiguous_claim(text_to_process, url, source_type)]

    async def _atomize_and_extract_claims(
        self,
        text: str,
        url: Optional[str],
        source: str,
        source_type: str,
        context: Optional[str] = None
    ) -> List[Claim]:
        """
        Split the text into atomic claims via the LLM.

        Args:
            text: Sanitized text to extract claims from.
            url: Source URL, if known.
            source: Origin of the text ("selection" or "extracted"); drives
                which context instruction variant the prompt receives.
            source_type: Provenance source recorded on each Claim.
            context: Optional background page snippet (selections only).

        Returns:
            List of Claim objects; on LLM failure, a single ambiguous claim.
        """
        context_instruction = ""

        if source == "selection" and context:
            context_instruction = (
                f"CONTEXT INFO:\n"
                f"The user selected the text below from a webpage ({url or 'unknown'}).\n"
                f"Here is a snippet of the page content to help you understand the topic:\n"
                f"--- BEGIN CONTEXT ---\n{context}\n--- END CONTEXT ---\n"
                f"Use this context to resolve ambiguities (e.g. what 'it' refers to), but ONLY extract claims from the 'USER SELECTION'."
            )

        elif source == "selection" and url:
            context_instruction = f"SOURCE URL: {url}. Use the domain to infer the likely topic if needed."

        elif source == "extracted":
            context_instruction = f"SOURCE URL: {url or 'unknown'}. Use the domain to infer the likely topic if needed."

        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert fact-checker. "
                       "Your task is to extract distinct, checkable factual claims from the provided text.\n"
                       "Rules:\n"
                       "1. Split compound statements (e.g. 'X is true and Y is false' -> [X, Y]).\n"
                       "2. Ignore pure opinions or rhetorical questions.\n"
                       "3. Keep claims concise and self-contained.\n"
                       "{context_instruction}\n\n"
                       "{format_instructions}"),
            ("user", "USER SELECTION to analyze:\n{text}")
        ])

        chain = prompt | self.llm | self.output_parser

        try:
            result = await chain.ainvoke({
                "text": text,
                "context_instruction": context_instruction,
                "format_instructions": self.output_parser.get_format_instructions()
            })
            logger.info(f"Successfully extracted claims using atomization {result}.")

            claims = []

            # The parser may yield either {"claims": [...]} or a bare list.
            claims_list = result.get("claims", []) if isinstance(result, dict) else result

            for item in claims_list:
                if isinstance(item, dict):
                    claim_text = item.get("text", str(item))
                    claim_type = item.get("type", "factual")
                else:
                    claim_text = str(item)
                    claim_type = "factual"

                claims.append(Claim(
                    claim_id=str(uuid.uuid4()),
                    text=claim_text,
                    normalized_text=claim_text.lower().strip(),
                    claim_type=claim_type,
                    provenance=Provenance(
                        source=source_type,
                        url=url,
                        # Store only a short preview of the prompt context.
                        context=(context_instruction[:200] + "...") if context_instruction else None,
                    ),
                    # Factual claims get higher extraction confidence than others.
                    confidence=0.9 if claim_type == "factual" else 0.6
                ))
            logger.info(f"Extracted {len(claims)} claims using atomization.")
            return claims

        except Exception as e:
            logger.error(f"Error during claim atomization and extraction: {str(e)}")
            # Ensure source_type has a valid value for Provenance
            valid_source_type = source_type if source_type in ("selection", "extracted", "user_provided") else "extracted"
            return [self._create_ambiguous_claim("Error during claim extraction.", url, valid_source_type)]

    def _create_ambiguous_claim(self, text: str, url: Optional[str], source_type: str) -> Claim:
        """Fallback to create an ambiguous claim when extraction fails."""
        # Coerce unexpected values into a Provenance-valid source literal.
        valid_source_type = source_type if source_type in ("selection", "extracted", "user_provided") else "extracted"
        return Claim(
            claim_id=str(uuid.uuid4()),
            text=text,
            normalized_text=text.lower().strip(),
            claim_type="ambiguous",
            provenance=Provenance(
                source=valid_source_type,
                url=url,
                context=(text[:100] + "...") if text else None
            ),
            confidence=0.0
        )
|
| 212 |
+
|
| 213 |
+
# Example Usage:
async def main():
    """Manual smoke test: run the agent against a sample upstream verdict and print results."""
    verdict = {'url': 'https://databackedafrica.com/', 'trust_level': 'medium-high', 'score': 80, 'red_flags': ['Brand new TLS certificate (3 days'], 'summary': None, 'source_used': ['https://databackedafrica.com/']}
    agent = ClaimExtractionAgent()
    claims = await agent.run(verdict)
    for claim in claims:
        print(f"Claim ID: {claim.claim_id}, Text: {claim.text}, Type: {claim.claim_type}, Confidence: {claim.confidence}")

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
|
app/services/claims/tools.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 4 |
+
from langchain_core.tools import tool
|
| 5 |
+
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
|
| 6 |
+
|
| 7 |
+
from app.core.config import config
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
logger.setLevel(logging.INFO)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ClaimTools:
    """
    A collection of tools for fetching, extracting and cleaning texts
    for the claim extraction agent.
    """

    @staticmethod
    def sanitize_text(text: str, max_length: Optional[int] = None) -> Tuple[str, bool]:
        """
        Cleans and sanitizes the input text by removing unwanted characters,
        excessive whitespace, and truncating to max_length if specified.

        Args:
            text (str): The input text to sanitize.
            max_length (Optional[int]): Maximum length of the sanitized text.

        Returns:
            (cleaned_text, was_truncated): The cleaned text and a boolean
            indicating whether truncation occurred.
        """
        if not text:
            return "", False

        # Strip zero-width spaces and BOM, normalize line endings, then
        # collapse every run of whitespace into a single space.
        text = text.replace("\u200b", " ").replace("\ufeff", "")
        cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
        cleaned = " ".join(cleaned.split())

        was_truncated = False
        if max_length and len(cleaned) > max_length:
            cleaned = cleaned[:max_length]
            was_truncated = True

        return cleaned, was_truncated

    @staticmethod
    @tool("scrape_article_text")
    async def scrape_article_text(url: str) -> str:
        """
        Extracts the main body text from an article given its URL using FireCrawl.
        Useful when the user provides a URL without a specific text selection.

        Returns an empty string when the API key is missing, scraping fails,
        or the page yields no meaningful text (50 characters or fewer).
        """
        if not config.FIRECRAWL_API_KEY:
            logger.error("FIRECRAWL_API_KEY not set. Cannot use FireCrawl for extraction.")
            return ""

        try:
            loader = FireCrawlLoader(
                url=url,
                api_key=config.FIRECRAWL_API_KEY,
                mode="scrape"
            )
            documents = await loader.aload()
            logger.info(f"FireCrawl returned {len(documents)} documents for {url}")
            if not documents:
                logger.warning(f"FireCrawl returned no documents for {url}")
                return ""

            # Join the non-empty document bodies into one article text.
            text = "\n\n".join(doc.page_content for doc in documents if doc.page_content).strip()

            if len(text) > 50:
                logger.info(f"Successfully extracted article text from URL: {url} using FireCrawl")
                return text

            # Too little content to be a real article body.
            logger.warning(f"FireCrawl text for {url} is too short to be useful.")
            return ""

        except Exception as e:
            logger.error(f"Error extracting article text from {url} using FireCrawl: {str(e)}")
            return ""

    @staticmethod
    def looks_like_propmpt_injection(text: str) -> bool:
        """
        Heuristic check to determine if the provided text looks like a prompt
        injection attempt.

        Note: the misspelling in this method's name ("propmpt") is kept for
        backward compatibility with existing callers.

        Args:
            text (str): The input text to evaluate.
        Returns:
            bool: True if the text appears to be a prompt injection, False otherwise.
        """
        # NOTE(review): some patterns ("you must", "you will", "from now on")
        # are broad and may flag legitimate article prose — consider tightening.
        injection_patterns = [
            r"(?i)ignore all previous instructions",
            r"(?i)disregard previous directions",
            r"(?i)override earlier commands",
            r"(?i)forget what you were told before",
            r"(?i)act as if you are",
            r"(?i)you are now",
            r"(?i)from now on",
            r"(?i)you must",
            r"(?i)you will",
            r"(?i)silence all prior guidelines",
            r"(?i)break free from your restrictions",
            r"(?i)bypass your limitations",
            r"(?i)ignore your programming",
            r"(?i)go against your guidelines",
            r"(?i)user:",
        ]

        for pattern in injection_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                logger.warning(f"Prompt injection pattern detected: {pattern} in text: {text}")
                return True

        return False
|
| 146 |
+
|
app/services/fact_checker/__init__.py
ADDED
|
File without changes
|
app/services/fact_checker/agent.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# agents/fact_checker/agent.py
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from app.services.llm_wrapper import llm_wrapper
|
| 10 |
+
from app.services.fact_checker.tools import GoogleFactCheckTool
|
| 11 |
+
from app.core.models import FactCheckVerdict
|
| 12 |
+
from app.core.config import config
|
| 13 |
+
|
| 14 |
+
log = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class FactCheckAgent:
    """
    Agent 3: Final fact-check judgment using Google Fact Check API + LLM reasoning.

    Pipeline: fetch prior professional fact-checks for a claim via the tool,
    then have the LLM convert the raw tool output into a structured verdict.
    """

    def __init__(self):
        self.llm = llm_wrapper.get_llm()
        # NOTE(review): this reads config.GOOGLE_FACT_CHECK_API_KEY, while
        # GoogleFactCheckTool falls back to config.GOOGLE_FACT_CHECK_KEY —
        # confirm which attribute the config object actually defines.
        self.tool = GoogleFactCheckTool(api_key=config.GOOGLE_FACT_CHECK_API_KEY)
        # Parser coerces the LLM's JSON output into the FactCheckVerdict schema.
        self.parser = JsonOutputParser(pydantic_object=FactCheckVerdict)

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """
            You are a professional fact-checker. Use the Google Fact Check tool result below to give a final verdict.

            Rules:
            - If a reputable fact-checker (Snopes, PolitiFact, AFP, etc.) rated it → trust them
            - "False", "Pants on Fire" → debunked
            - "True" → verified
            - "Mixture", "Mostly False" → mixture
            - No result → unverified
            - Be concise and neutral

            Return JSON only.
            {format_instructions}
            """),
            ("human", "Claim: {claim}\nTool result: {tool_result}")
        ])

        self.chain = self.prompt | self.llm | self.parser

    async def run(self, claim: str) -> Dict[str, Any]:
        """
        Verify a single claim.

        Args:
            claim: The factual claim to check.

        Returns:
            Dict with "agent", "claim", "verdict" (parsed LLM JSON) and
            "raw_tool_result"; on LLM failure, a fallback "unverified"
            verdict plus an "error" key instead of "raw_tool_result".
        """
        log.info(f"FactCheckAgent verifying: {claim[:60]}...")

        # Step 1: Use tool to get raw fact-check data.
        # NOTE(review): calls the tool's private _search() directly rather than
        # the @tool-decorated wrapper; works, but couples the agent to the
        # tool's internals.
        raw_result = await self.tool._search(claim)
        tool_output = str(raw_result)

        # Step 2: LLM makes final reasoned verdict
        try:
            verdict = await self.chain.ainvoke({
                "claim": claim,
                "tool_result": tool_output,
                "format_instructions": self.parser.get_format_instructions()
            })

            return {
                "agent": "fact_checker",
                "claim": claim,
                "verdict": verdict,
                "raw_tool_result": raw_result,
            }

        except Exception as e:
            log.error(f"LLM failed in FactCheckAgent: {e}")
            # NOTE(review): this fallback verdict's keys (confidence,
            # explanation, sources) do not match the FactCheckVerdict schema
            # returned on the success path — confirm downstream consumers
            # tolerate both shapes.
            return {
                "agent": "fact_checker",
                "claim": claim,
                "verdict": {
                    "verdict": "unverified",
                    "confidence": 0.1,
                    "explanation": "Fact-check processing failed",
                    "sources": []
                },
                "error": str(e)
            }
|
app/services/fact_checker/tools.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# agents/fact_checker/tool.py
|
| 2 |
+
import asyncio
|
| 3 |
+
import hashlib
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, List, Optional
|
| 6 |
+
|
| 7 |
+
import aiohttp
|
| 8 |
+
from langchain_core.tools import tool
|
| 9 |
+
|
| 10 |
+
from app.core.config import config
|
| 11 |
+
|
| 12 |
+
log = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class GoogleFactCheckTool:
    """LangChain tool that verifies claims using Google Fact Check Tools API"""

    def __init__(self, api_key: str):
        # NOTE(review): fallback attribute GOOGLE_FACT_CHECK_KEY differs from
        # the GOOGLE_FACT_CHECK_API_KEY used by FactCheckAgent — confirm which
        # name the config object actually defines.
        self.api_key = api_key or config.GOOGLE_FACT_CHECK_KEY
        self.base_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
        # In-memory result cache keyed by normalized-claim hash.
        # NOTE(review): unbounded — fine for a short-lived instance, a slow
        # leak if this object lives for the whole process.
        self.cache: Dict[str, dict] = {}

    def _hash(self, claim: str) -> str:
        # Lowercase + strip so trivially different spellings share a cache slot.
        return hashlib.sha256(claim.lower().strip().encode()).hexdigest()

    async def _search(self, claim: str) -> dict:
        """Query the Fact Check API for the claim, with caching and a 10s timeout.

        Returns a parsed result dict (see _parse); never raises.
        """
        if cached := self.cache.get(self._hash(claim)):
            return cached

        if not self.api_key:
            return {"status": "error", "reason": "API key missing"}

        params = {"query": claim, "key": self.api_key, "languageCode": "en"}
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) as session:
                async with session.get(self.base_url, params=params) as resp:
                    # Non-200 responses are treated as "no fact-checks found".
                    data = await resp.json() if resp.status == 200 else {}
                    result = self._parse(data.get("claims", []), claim)
                    self.cache[self._hash(claim)] = result
                    return result
        except Exception as e:
            log.warning(f"Fact-check API error: {e}")
            return {"status": "unverified", "reason": "API error"}

    def _parse(self, claims: List[dict], original: str) -> dict:
        """Map the first claimReview of the first API hit onto a small status dict.

        Status is derived by substring match on the textual rating:
        debunked / verified / mixture, defaulting to unverified.
        """
        if not claims:
            return {
                "status": "unverified",
                "claim": original,
                "reason": "No fact-checks found",
            }

        # Only the first review of the first matching claim is considered.
        review = claims[0].get("claimReview", [{}])[0]
        rating = review.get("textualRating", "").lower()

        status_map = {
            "false": "debunked", "pants": "debunked", "incorrect": "debunked",
            "true": "verified", "accurate": "verified",
            "mixture": "mixture", "half": "mixture", "mostly": "mixture",
        }
        # NOTE(review): dict order matters — "false" is checked before "true",
        # so "mostly false" maps to debunked, not mixture; confirm intended.
        status = next((v for k, v in status_map.items() if k in rating), "unverified")

        return {
            "status": status,
            "claim": original,
            "textual_rating": review.get("textualRating"),
            "source_url": review.get("url"),
            "fact_checker": review.get("publisher", {}).get("name"),
            "review_date": review.get("reviewDate"),
        }

    # LangChain Tool
    # NOTE(review): @tool applied to an instance method keeps `self` in the
    # tool's input schema, so LangChain cannot invoke this as a standalone
    # tool. FactCheckAgent bypasses it and calls _search() directly — confirm
    # whether this wrapper is used anywhere before relying on it.
    @tool("google_fact_check")
    async def google_fact_check(self, claim: str) -> str:
        """
        Use this tool to verify factual claims against professional fact-checkers.
        Input: A single factual claim (e.g., "The Earth is flat")
        Output: Verification result with source
        """
        result = await self._search(claim)
        if result["status"] in ["verified", "debunked"]:
            return f"Fact-check result: {result['textual_rating']} by {result['fact_checker']}. Source: {result['source_url']}"
        return f"No reliable fact-check found for: {claim}"
|
| 84 |
+
|
app/services/identify/__init__.py
ADDED
|
File without changes
|
app/services/identify/agent.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import uuid
|
| 4 |
+
from typing import Dict, Any, Optional
|
| 5 |
+
|
| 6 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 7 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
from app.services.identify.tools import SourceCredibilityTool
|
| 11 |
+
from app.services.llm_wrapper import llm_wrapper
|
| 12 |
+
|
| 13 |
+
from app.core.config import config
|
| 14 |
+
from app.core.models import CredibilityVerdict
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
logger.setLevel(logging.INFO)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SourceCredibilityAgent:
    """
    Agent responsible for assessing the credibility of a source URL.
    Uses raw tools to gather data and an LLM to analyze and produce a verdict.
    """

    def __init__(self):
        self.llm = llm_wrapper.get_llm()
        self.tool = SourceCredibilityTool()
        # Parser coerces the LLM's JSON output into the CredibilityVerdict schema.
        self.output_parser = JsonOutputParser(
            pydantic_object=CredibilityVerdict
        )
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """
            You are a senior fact-checking analyst specializing in source credibility evaluation.

            Using the technical signals below, produce a final credibility verdict.

            Guidelines:
            - Be strict: new domains (<6 months), no SSL history, or malicious verdicts → very_low
            - Established domains (>3 years), clean records → high
            - Heavy trackers/ads + obscure ASN → downgrade
            - Never trust sites flagged by Google Safe Browsing or urlscan.io as malicious
            - Bias: infer only if strong patterns (e.g., known partisan ASN or domain name)
            - BE CONCISE in your final verdict summary.
            - BE CONSISTENT between trust_level and score.

            Return valid JSON only.
            {format_instructions}
            """.strip()),
            ("human", "Assess credibility of this source:\n\n{report_json}")
        ])

        self.chain = self.prompt | self.llm | self.output_parser

    async def run(self, url: str) -> Dict[str, Any]:
        """
        Main method to run the Source Credibility Agent.

        Args:
            url (str): The URL of the source to assess.

        Returns:
            Dict[str, Any]: Verdict fields (url, trust_level, score, red_flags,
            summary, source_used). Note: despite the parser targeting
            CredibilityVerdict, both paths return a plain dict.
        """
        logger.info(f"Assessing credibility for URL: {url}")

        # Gather raw technical signals for the domain via the credibility tool.
        output_report = await self.tool.check_source_credibility.ainvoke(url)

        try:
            verdict = await self.chain.ainvoke({
                "report_json": json.dumps(output_report, indent=2),
                "format_instructions": self.output_parser.get_format_instructions()
            })

            final_verdict = {
                "url": url,
                "trust_level": verdict.get("trust_level"),
                "score": verdict.get("score"),
                "red_flags": verdict.get("red_flags"),
                "summary": verdict.get("summary"),
                # Fall back to the assessed URL when the LLM lists no sources.
                "source_used": verdict.get("source_used") if verdict.get("source_used") else [url]
            }

            return final_verdict

        except Exception as e:
            logger.error(f"Error generating credibility verdict for {url}: {str(e)}")
            # Fail-soft: callers always receive a dict of the same shape.
            return {
                "url": url,
                "trust_level": "unknown",
                "score": 0.0,
                "red_flags": ["error_generating_verdict"],
                "summary": "Could not generate credibility verdict due to an error.",
                "source_used": [url]
            }
|
| 99 |
+
|
| 100 |
+
# # Example usage:
|
| 101 |
+
# async def main():
|
| 102 |
+
# url = "https://databackedafrica.com/"
|
| 103 |
+
# agent = SourceCredibilityAgent()
|
| 104 |
+
# verdict = await agent.run(url)
|
| 105 |
+
# print(f"Credibility Verdict: {verdict}")
|
| 106 |
+
|
| 107 |
+
# if __name__ == "__main__":
|
| 108 |
+
# import asyncio
|
| 109 |
+
# asyncio.run(main())
|
app/services/identify/tools.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import whois
|
| 2 |
+
import tldextract
|
| 3 |
+
import aiohttp
|
| 4 |
+
import datetime
|
| 5 |
+
import re
|
| 6 |
+
import asyncio
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
import os
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
from langchain_core.tools import tool
|
| 13 |
+
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
from app.core.config import config
|
| 17 |
+
|
| 18 |
+
# class Config:
|
| 19 |
+
# GOOGLE_APIS_KEY: Optional[str] = os.getenv("GOOGLE_APIS_KEY")
|
| 20 |
+
# FIRECRAWL_API_KEY: Optional[str] = os.getenv("FIRECRAWL_API_KEY")
|
| 21 |
+
# URLSCAN_API_KEY: Optional[str] = os.getenv("URLSCAN_API_KEY")
|
| 22 |
+
|
| 23 |
+
# config = Config()
|
| 24 |
+
|
| 25 |
+
import logging
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
logging.basicConfig(level=logging.INFO)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class SourceCredibilityTool:
    """
    A collection of tools for verifying source URLs.

    Combines registrable-domain extraction with urlscan.io submit/poll
    calls, flattening the raw scan into LLM-friendly credibility signals.
    """

    @staticmethod
    def extract_domain(url: str) -> str:
        """
        Extract the registrable domain (e.g. "example.com") from a URL.

        Returns "unknown" when the URL has no recognizable public suffix.
        """
        extracted = tldextract.extract(url)
        logger.info(f"Extracted components: {extracted}")
        if not extracted.suffix:
            logger.warning(f"No suffix found for URL: {url}")
            return "unknown"
        domain = f"{extracted.domain}.{extracted.suffix}"
        logger.info(f"Extracted domain: {domain}")
        return domain

    @staticmethod
    async def _submit_to_urlscan(url: str) -> Optional[str]:
        """
        Submit a URL to urlscan.io for analysis.

        Returns the API result URL to poll for the finished scan, or None
        when the API key is missing or the submission fails.
        """
        api_key = config.URLSCAN_API_KEY
        if not api_key:
            logger.error("URLSCAN_API_KEY is not set in the environment variables.")
            return None

        submit_url = "https://urlscan.io/api/v1/scan/"
        headers = {
            'Content-Type': 'application/json',
            'API-Key': api_key,
        }
        data = {
            'url': url,
            'visibility': 'public'
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(submit_url, json=data, headers=headers) as response:
                    if response.status == 200:
                        resp_json = await response.json()
                        scan_id = resp_json.get('uuid')
                        # BUG FIX: the old code returned
                        # `data.get("result") or result_url`, but `data` is the
                        # *request payload* and never has a "result" key, so the
                        # expression was dead and misleading. The API result
                        # endpoint built from the scan uuid is what the poller
                        # needs; return it directly.
                        return f"https://urlscan.io/api/v1/result/{scan_id}/"
                    else:
                        text = await response.text()
                        logger.error(f"Failed to submit URL to urlscan.io, status code: {response.status} {text}")
                        return None
        except aiohttp.ClientError as e:
            logger.error(f"Error submitting URL to urlscan.io: {e}")
            return None

    @staticmethod
    async def _fetch_urlscan_result(result_url: str) -> Optional[Dict[str, Any]]:
        """
        Fetch a finished urlscan.io result as JSON.

        Returns None on any failure, including the 404 urlscan returns while
        the scan is still being processed (callers poll until success).
        """
        api_key = config.URLSCAN_API_KEY
        if not api_key:
            logger.error("URLSCAN_API_KEY is not set in the environment variables.")
            return None

        headers = {
            'API-Key': api_key,
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(result_url, headers=headers) as response:
                    if response.status == 200:
                        resp_json = await response.json()
                        return resp_json
                    else:
                        text = await response.text()
                        logger.error(f"Failed to fetch urlscan.io result, status code: {response.status} {text}")
                        return None
        except aiohttp.ClientError as e:
            logger.error(f"Error fetching urlscan.io result: {e}")
            return None

    # CONSISTENCY FIX: declared @staticmethod like its siblings — the function
    # takes no `self` and all call sites already access it via the class.
    @staticmethod
    def extract_credibility_signals(urlscan_result: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten a raw urlscan.io result into a dict of credibility signals."""
        data = urlscan_result
        page = data.get("page", {})
        stats = data.get("stats", {})
        verdicts = data.get("verdicts", {})
        task = data.get("task", {})
        lists = data.get("lists", {})

        return {
            "url": task.get("url"),
            "scan_date": task.get("time"),
            "screenshot_url": task.get("screenshotURL"),

            # Critical verdicts
            "malicious_detected": verdicts.get("overall", {}).get("malicious", False),
            "engine_detections": verdicts.get("engines", {}).get("maliciousTotal", 0),
            "suspicious_categories": verdicts.get("overall", {}).get("categories", []),

            # Domain & TLS age (missing age data is treated as "old enough")
            "domain_age_days": page.get("apexDomainAgeDays", 0),
            "tls_age_days": page.get("tlsAgeDays", 0),
            "is_new_domain": page.get("apexDomainAgeDays", 9999) < 180,
            "is_brand_new_tls": page.get("tlsAgeDays", 9999) < 60,

            # Security posture
            "secure_percentage": stats.get("securePercentage", 100),
            "uses_mixed_content": stats.get("securePercentage", 100) < 98,

            # Hosting
            "server": page.get("server"),
            "asn": page.get("asn"),
            "asn_name": page.get("asnname"),
            "ip": page.get("ip"),

            # Privacy / trackers (approximate)
            "total_requests": sum(s.get("count", 0) for s in stats.get("resourceStats", [])),
            "third_party_domains": len(lists.get("domains", [])) - 1,

            # Suspicious patterns
            "has_data_urls": any("data:" in r.get("request", {}).get("url", "") for r in data.get("data", {}).get("requests", [])),
            "redirects_to_suspicious": any(
                tldextract.extract(url).domain in ["bit", "tinyurl"] or tldextract.extract(url).suffix in ["ru", "xyz", "top"]
                for url in lists.get("linkDomains", [])
            ),

            # Bonus: popularity
            "umbrella_rank": next(
                (item["rank"] for item in data.get("meta", {}).get("processors", {}).get("umbrella", {}).get("data", []) if item["hostname"] == page.get("domain")),
                None
            ),
        }

    @staticmethod
    @tool("check_source_credibility")
    async def check_source_credibility(url: str) -> Dict[str, Any]:
        """
        Check the credibility of a source URL using urlscan.io.
        Returns a dictionary with credibility information.
        """
        # Baseline payload returned when the scan cannot be submitted.
        # NOTE(review): on success the function returns the flattened signal
        # dict instead of this shape — confirm downstream consumers accept
        # both shapes.
        result = {
            "url": url,
            "domain": SourceCredibilityTool.extract_domain(url),
            "urlscan_result": None,
            "verdict": None,
            "is_malicious": None,
            "suspicious": None,
            "categories": []
        }

        result_url = await SourceCredibilityTool._submit_to_urlscan(url)
        if not result_url:
            logger.error(f"Could not submit URL to urlscan.io: {url}")
            return result

        # Poll for the finished scan: urlscan needs a few seconds before the
        # result endpoint stops returning 404.
        urlscan_data = None
        for _ in range(10):  # Retry up to 10 times
            await asyncio.sleep(5)  # Wait before retrying
            urlscan_data = await SourceCredibilityTool._fetch_urlscan_result(result_url)
            if urlscan_data:
                break

        urlscan_insights = {}

        if urlscan_data:
            result["urlscan_result"] = urlscan_data
            credibility_signals = SourceCredibilityTool.extract_credibility_signals(urlscan_data)
            urlscan_insights.update(credibility_signals)

        return urlscan_insights
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# # # Example usage:
|
| 213 |
+
# async def main():
|
| 214 |
+
# url = "https://bit.ly/3X9kP2m/"
|
| 215 |
+
# identifier = SourceCredibilityTool()
|
| 216 |
+
|
| 217 |
+
# domain = identifier.extract_domain(url)
|
| 218 |
+
# print(f"Extracted domain: {domain}")
|
| 219 |
+
|
| 220 |
+
# credibility = await identifier.check_source_credibility.ainvoke(url)
|
| 221 |
+
# print(f"Source credibility report: {credibility}")
|
| 222 |
+
|
| 223 |
+
# if __name__ == "__main__":
|
| 224 |
+
# import asyncio
|
| 225 |
+
# asyncio.run(main())
|
app/services/llm_wrapper.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Dict, Any, Optional
|
| 4 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 5 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
from app.core.config import config
|
| 9 |
+
|
| 10 |
+
load_dotenv() # Load environment variables from a .env file if present
|
| 11 |
+
|
| 12 |
+
class LLMWrapper:
    """
    Centralized LLM Wrapper for the Verifacts System.

    Standardizes model configuration (name, temperature, token budget) and
    exposes a single shared ChatGoogleGenerativeAI instance via the
    get_instance() singleton accessor.
    """

    # Lazily-created singleton instance; access through get_instance().
    _instance = None

    def __init__(self):
        # All model parameters come from the central app config.
        self.model_name = config.LLM_MODEL_NAME
        self.temperature = config.LLM_TEMPERATURE
        self.max_tokens = config.LLM_MAX_TOKEN
        self.api_key = config.GEMINI_API_KEY

        # Fail fast: the client is unusable without a key.
        if not self.api_key:
            raise ValueError("GEMINI_API_KEY is not set in the environment variables.")

        # (Removed a dead `self.llm = None` that was immediately overwritten.)
        self.llm = ChatGoogleGenerativeAI(
            model=self.model_name,
            temperature=self.temperature,
            max_output_tokens=self.max_tokens,
            api_key=self.api_key
        )

    @classmethod
    def get_instance(cls):
        """Return the process-wide singleton, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def get_llm(self):
        """Returns the underlying LLM instance."""
        return self.llm
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
llm_wrapper = LLMWrapper.get_instance()
|
| 51 |
+
|
app/services/orchestrator.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import asyncio
|
| 3 |
+
from typing import Dict, TypedDict, Annotated, List
|
| 4 |
+
|
| 5 |
+
from langchain_core.runnables import Runnable
|
| 6 |
+
from langgraph.graph import StateGraph, END
|
| 7 |
+
from langgraph.checkpoint.memory import MemorySaver # For state persistence
|
| 8 |
+
from redis import Redis # pip install redis
|
| 9 |
+
from langchain_community.cache import RedisCache
|
| 10 |
+
|
| 11 |
+
from app.services.identify.agent import SourceCredibilityAgent
|
| 12 |
+
from app.services.claims.agent import ClaimExtractionAgent
|
| 13 |
+
from app.services.fact_checker.agent import FactCheckAgent
|
| 14 |
+
from app.core.config import config
|
| 15 |
+
from app.services.shared_tools import tavily_search
|
| 16 |
+
from app.services.llm_wrapper import llm_wrapper
|
| 17 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 18 |
+
from app.core.models import FinalReport
|
| 19 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 20 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
logger.setLevel(logging.INFO)
|
| 25 |
+
|
| 26 |
+
class WorkflowState(TypedDict):
    """Shared state flowing through the fact-checking LangGraph pipeline."""
    url: str
    selection: str
    credibility: Annotated[Dict, "Source credibility report"]
    claims: Annotated[List[Dict], "Extracted claims"]
    fact_checks: Annotated[List[Dict], "Fact check verdicts"]
    search_insights: Annotated[List[Dict], "Tavily search results with snippets for enrichment"]
    error: Annotated[str, "Error message, if any"]
    # BUG FIX: compile_report_node writes these three keys, but they were not
    # declared in the schema — LangGraph only tracks declared channels, so the
    # final report fields were silently dropped from the graph state.
    overall_verdict: Annotated[str, "Aggregated verdict for the whole article"]
    summary: Annotated[str, "Human-readable final report summary"]
    sources: Annotated[List[str], "Flattened list of supporting source URLs"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# === Agent Nodes ===
|
| 37 |
+
async def credibility_node(state: WorkflowState) -> WorkflowState:
    """Run the source-credibility agent and record its report in the state.

    Marks the state with an error when no URL is present, when the agent
    fails, or when the source's trust level is too low to continue.
    """
    checker = SourceCredibilityAgent()
    try:
        target_url = state.get("url")
        if not target_url:
            state["error"] = "No URL provided for credibility check"
            return state
        report = await checker.run(target_url)  # Make sure agent.run() accepts url as string
        state["credibility"] = report
        logger.info(f"Credibility report: {report}")
        if report.get("trust_level", "unknown") in ("low", "very_low"):
            state["error"] = "Source credibility too low to proceed"
    except Exception as exc:
        logger.error(f"Credibility check error: {str(exc)}")
        state["error"] = f"Credibility check failed: {str(exc)}"
    return state
|
| 54 |
+
|
| 55 |
+
async def extraction_node(state: WorkflowState) -> WorkflowState:
    """Extract factual claims from the source via ClaimExtractionAgent.

    Skips all work when an upstream node has already recorded an error.
    """
    if state.get("error"):
        return state  # Skip if previous error
    agent = ClaimExtractionAgent()
    try:
        # Build verdict dict from state to pass to agent
        verdict = {
            "url": state.get("url"),
            "selection": state.get("selection"),
            "trust_level": state.get("credibility", {}).get("trust_level"),
            "score": state.get("credibility", {}).get("score"),
        }
        claims = await agent.run(verdict)  # Pass verdict to agent
        logger.info(f"Extracted {len(claims)} claims")
        # NOTE(review): assumes each claim object exposes `.text` and
        # `.claim_type` attributes — confirm against ClaimExtractionAgent's
        # return type. Only "factual" claims are kept, and they are stored as
        # plain strings even though WorkflowState declares `claims` as
        # List[Dict]; downstream factcheck_node treats each entry as a string.
        state["claims"] = [c.text for c in claims if c.claim_type == "factual"]
    except Exception as e:
        logger.error(f"Claim extraction error: {str(e)}")
        state["error"] = f"Claim extraction failed: {str(e)}"
    return state
|
| 74 |
+
|
| 75 |
+
async def factcheck_node(state: WorkflowState) -> WorkflowState:
    """Fact-check every extracted claim and collect the verdicts in order.

    A no-op when an upstream error is set or no claims were extracted.
    """
    if state.get("error") or not state.get("claims"):
        return state  # Skip if previous error or no claims
    checker = FactCheckAgent()
    try:
        verdicts = []
        for claim_text in state["claims"]:
            outcome = await checker.run(claim_text)
            logger.info(f"Fact-check result for claim '{claim_text[:30]}...': {outcome}")
            verdicts.append(outcome)
        state["fact_checks"] = verdicts
    except Exception as exc:
        state["error"] = f"Fact-checking failed: {str(exc)}"
    return state
|
| 89 |
+
|
| 90 |
+
# === NEW: Tavily Enrichment (Always runs after extraction) ===
|
| 91 |
+
async def search_enrichment_node(state: WorkflowState) -> WorkflowState:
    """Enrich each extracted claim with Tavily web-search context.

    Per-claim failures are logged and skipped so one bad search does not
    abort the whole workflow. A no-op on upstream error or when no claims
    were extracted.
    """
    if state.get("error") or not state.get("claims"):
        return state

    insights = []
    for claim in state["claims"]:
        try:
            query = f"fact check: {claim} site:reputable"
            # BUG FIX: LangChain Runnables take a single `input` argument;
            # the old call `ainvoke(query=query, max_results=3)` raised a
            # TypeError on every claim, so no insights were ever collected.
            results = await tavily_search.ainvoke({"query": query, "max_results": 3})
            # BUG FIX: the tool may return a plain string; only list-shaped
            # results carry per-result dicts with a "url" field. Iterating a
            # string here used to raise and drop the whole insight.
            if isinstance(results, list):
                sources = [r.get("url") for r in results if isinstance(r, dict)]
            else:
                sources = []
            insights.append({
                "claim": claim,
                "results": results,  # Includes snippets, answers, sources
                "sources": sources,
            })
        except Exception as e:
            logger.warning(f"Tavily failed for claim '{claim}': {e}")

    state["search_insights"] = insights
    return state
|
| 109 |
+
|
| 110 |
+
# === NEW: Compile Final Report ===
|
| 111 |
+
async def compile_report_node(state: WorkflowState) -> WorkflowState:
    """Compile the final fact-check report from all upstream results.

    An LLM aggregates credibility, claims, fact-check verdicts and search
    insights into an overall verdict and summary; on any failure a basic
    fallback report is written instead, so the node never raises.

    NOTE(review): this node writes "overall_verdict", "summary" and
    "sources", which are not declared in WorkflowState — LangGraph tracks
    only declared channels, so these writes may be dropped from the graph
    state. Confirm against the state schema.
    """
    # LLM summarizes overall
    prompt = ChatPromptTemplate.from_template("""
    You are a fact-check report compiler. Analyze the following state and generate a final report.

    State:
    - URL: {url}
    - Source Credibility: {credibility}
    - Claims Extracted: {claims}
    - Fact Check Results: {fact_checks}
    - Search Insights: {search_insights}

    Rules for verdict:
    - If most claims are verified → "verified"
    - If most claims are debunked → "debunked"
    - If mixed results → "mixture"
    - If insufficient evidence → "unverified"

    {format_instructions}

    Respond ONLY with valid JSON. Do not include any markdown formatting, explanations, or text outside the JSON object.
    """)
    llm = llm_wrapper.get_llm()
    # Parses the LLM output against the FinalReport schema.
    output_parser = JsonOutputParser(pydantic_object=FinalReport)
    chain = prompt | llm | output_parser

    try:
        compiled = await chain.ainvoke({
            "url": state.get("url", ""),
            "credibility": state.get("credibility", {}),
            "claims": state.get("claims", []),
            "fact_checks": state.get("fact_checks", []),
            "search_insights": state.get("search_insights", []),
            "format_instructions": output_parser.get_format_instructions()
        })
        logger.info(f"Compiled report: {compiled}")
        state["overall_verdict"] = compiled.get("overall_verdict", "unverified")
        state["summary"] = compiled.get("summary", "No summary generated")
        # Flatten every insight's source URLs into one list.
        state["sources"] = [s for insight in state.get("search_insights", []) for s in insight["sources"]]
    except Exception as e:
        logger.error(f"Report compilation error: {str(e)}")
        # Fallback: Create a basic report without LLM
        state["overall_verdict"] = "unverified"
        state["summary"] = f"Report compilation failed. {len(state.get('claims', []))} claims extracted, {len(state.get('fact_checks', []))} fact-checks completed."
        state["sources"] = [s for insight in state.get("search_insights", []) for s in insight.get("sources", [])]
    return state
|
| 157 |
+
|
| 158 |
+
def decide_next_step(state: WorkflowState) -> str:
    """Route after the credibility check: stop on low trust, else extract claims.

    Returns END for low/very_low trust sources, otherwise the name of the
    claim-extraction node.
    """
    # BUG FIX: credibility_node stores the agent's report *flat* — it exposes
    # "trust_level" at the top level, not under a nested "verdict" key. The
    # old lookup always yielded "unknown", so low-trust sources were never
    # short-circuited here.
    cred = state.get("credibility", {}).get("trust_level", "unknown")
    if cred in ["low", "very_low"]:
        return END  # Still skip if very low
    return "extraction_node"
|
| 163 |
+
|
| 164 |
+
# === Orchestrator ===
|
| 165 |
+
workflow = StateGraph(state_schema=WorkflowState)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
workflow.add_node("credibility_node", credibility_node)
|
| 169 |
+
workflow.add_node("extraction_node", extraction_node)
|
| 170 |
+
workflow.add_node("search_enrichment_node", search_enrichment_node)
|
| 171 |
+
workflow.add_node("factcheck_node", factcheck_node)
|
| 172 |
+
workflow.add_node("compile_report_node", compile_report_node)
|
| 173 |
+
|
| 174 |
+
workflow.set_entry_point("credibility_node")
|
| 175 |
+
|
| 176 |
+
workflow.add_conditional_edges(
|
| 177 |
+
"credibility_node", decide_next_step
|
| 178 |
+
)
|
| 179 |
+
workflow.add_edge("extraction_node", "search_enrichment_node")
|
| 180 |
+
workflow.add_edge("search_enrichment_node", "factcheck_node")
|
| 181 |
+
workflow.add_edge("factcheck_node", "compile_report_node")
|
| 182 |
+
workflow.add_edge("compile_report_node", END)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
memory = MemorySaver()
|
| 186 |
+
graph = workflow.compile(checkpointer=memory)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
async def run_orchestrator(url: str, selection: str) -> WorkflowState:
    """Run the full fact-checking graph for a URL and selected text.

    Args:
        url: Article URL to analyze.
        selection: User-highlighted text to focus the analysis on.

    Returns:
        The final WorkflowState after all graph nodes have run.
    """
    initial_state: WorkflowState = {
        "url": url,
        "selection": selection,
        "credibility": {},
        "claims": [],
        "fact_checks": [],
        # BUG FIX: "search_insights" is declared in WorkflowState but was
        # missing from the initial state.
        "search_insights": [],
        "error": "",
    }
    # NOTE(review): a fixed thread_id means every run shares one checkpoint
    # thread in MemorySaver — confirm whether per-request ids are intended.
    final_state = await graph.ainvoke(initial_state, config={"configurable": {"thread_id": "main"}})
    return final_state
|
| 200 |
+
|
| 201 |
+
# Example usage
|
| 202 |
+
if __name__ == "__main__":
|
| 203 |
+
test_url = "https://www.nbcnews.com/politics/donald-trump/trump-cnn-warner-bros-discovery-netflix-paramount-rcna248518"
|
| 204 |
+
test_selection = "Paramount initiated a hostile bid, offering shareholders $30 per share."
|
| 205 |
+
|
| 206 |
+
result_state = asyncio.run(run_orchestrator(test_url, test_selection))
|
| 207 |
+
if result_state.get("error"):
|
| 208 |
+
logger.error(f"Orchestration failed: {result_state['error']}")
|
| 209 |
+
else:
|
| 210 |
+
logger.info(f"Orchestration completed successfully. Fact-checks: {result_state['fact_checks']}")
|
app/services/shared_tools.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.tools import tool
|
| 2 |
+
from app.core.cache import cache_get, cache_set, cache_delete, cache_stats
|
| 3 |
+
from app.core.config import config
|
| 4 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 5 |
+
|
| 6 |
+
@tool("cache_query")
async def cache_query(key: str) -> str:
    """
    Query a value from the global cache. Use to check if data is cached.
    Input: cache key (e.g., "claim:XYZ")
    """
    # Truthiness check mirrors the original: falsy cached values (None, "",
    # 0) report as a miss.
    cached = cache_get(key)
    if cached:
        return str(cached)
    return "Not found in cache"
|
| 14 |
+
|
| 15 |
+
@tool("cache_invalidate")
async def cache_invalidate(key: str) -> str:
    """
    Delete a key from global cache. Use to force refresh.
    Input: cache key
    """
    # Report whether the key actually existed before removal.
    if cache_delete(key):
        return "Deleted"
    return "Key not found"
|
| 23 |
+
|
| 24 |
+
@tool("cache_stats")
async def get_cache_stats() -> str:
    """
    Get global cache statistics. Use to monitor cache health.
    """
    # Stringified so the tool output is LLM-consumable.
    stats = cache_stats()
    return str(stats)
|
| 30 |
+
|
| 31 |
+
@tool("tavily_search")
async def tavily_search(query: str, max_results: int = 5) -> str:
    """
    Advanced AI-powered web search. Use for complex research or when standard search lacks context.
    Returns summarized results with sources.
    """
    # Renamed the local from `tool` — it shadowed the imported @tool
    # decorator used throughout this module.
    search = TavilySearchResults(
        max_results=max_results,
        api_key=config.TAVILY_API_KEY,  # Add to .env
        search_depth="advanced",
        include_answer=True,
        include_raw_content=True,
    )
    results = await search.ainvoke(input=query)
    return str(results)  # Or parse to dict
|
| 46 |
+
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "verifacts-backend"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = [
|
| 6 |
+
{name = "Testimony Adekoya"}
|
| 7 |
+
]
|
| 8 |
+
readme = "README.md"
|
| 9 |
+
requires-python = ">=3.10, <4.0.0"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"langchain-core (>=1.1.0,<2.0.0)",
|
| 12 |
+
"langchain-community (>=0.4.1,<0.5.0)",
|
| 13 |
+
"fastapi (>=0.121.3,<0.122.0)",
|
| 14 |
+
"uvicorn[standard] (>=0.38.0,<0.39.0)",
|
| 15 |
+
"pydantic (>=2.12.4,<3.0.0)",
|
| 16 |
+
"sqlalchemy (>=2.0.44,<3.0.0)",
|
| 17 |
+
"redis (>=7.1.0,<8.0.0)",
|
| 18 |
+
"httpx (>=0.28.1,<0.29.0)",
|
| 19 |
+
"python-multipart (>=0.0.20,<0.0.21)",
|
| 20 |
+
"langgraph (>=1.0.3,<2.0.0)",
|
| 21 |
+
"langchain-google-genai (>=3.1.0,<4.0.0)",
|
| 22 |
+
"python-dotenv (>=1.2.1,<2.0.0)",
|
| 23 |
+
"pytest (>=9.0.1,<10.0.0)",
|
| 24 |
+
"python-whois (>=0.9.6,<0.10.0)",
|
| 25 |
+
"tldextract (>=5.3.0,<6.0.0)",
|
| 26 |
+
"firecrawl (>=4.9.0,<5.0.0)",
|
| 27 |
+
"resend (>=2.19.0,<3.0.0)",
|
| 28 |
+
"newspaper4k (>=0.9.4.1,<0.10.0.0)",
|
| 29 |
+
"python-json-logger (>=4.0.0,<5.0.0)",
|
| 30 |
+
"langchain (>=1.1.3,<2.0.0)",
|
| 31 |
+
"tavily-python (>=0.7.14,<0.8.0)",
|
| 32 |
+
"langchain-openai (>=1.1.1,<2.0.0)",
|
| 33 |
+
"langchain-tavily (>=0.2.13,<0.3.0)"
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
[build-system]
|
| 38 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
| 39 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
pydantic
|
| 4 |
+
sqlalchemy
|
| 5 |
+
pydantic[email]
|
| 6 |
+
alembic
|
| 7 |
+
redis
|
| 8 |
+
httpx
|
| 9 |
+
python-multipart
|
| 10 |
+
langchain
|
| 11 |
+
langchain-core
|
| 12 |
+
langgraph
|
| 13 |
+
langchain-community
|
| 14 |
+
langchain-google-genai
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from fastapi.testclient import TestClient
|
| 3 |
+
from unittest.mock import patch, AsyncMock
|
| 4 |
+
|
| 5 |
+
from app.api.main import main
|
| 6 |
+
|
| 7 |
+
client = TestClient(main)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@pytest.fixture
def mock_graph_response():
    """
    Returns a fake state object that simulates a completed AI analysis.
    """
    # Shape mirrors what the /analyze endpoint is expected to receive from
    # the pipeline: identity verdict, extracted claims, per-claim
    # verification results, and one report per agent.
    # NOTE(review): confirm these keys match the real pipeline output schema.
    return {
        "is_verified_entity": True,
        "identity_score": 0.85,
        "verdict_status": "Verified",
        "extracted_claims": ["Claim 1", "Claim 2"],
        "claims_verified_count": 2,
        "claims_sourced_count": 2,
        "verification_results": [{"claim": "Claim 1", "status": "True"}],
        "agent_reports": [
            {
                "agent_name": "Firecrawl Reader",
                "output": ["Claim 1", "Claim 2"],
                "errors": []
            }
        ]
    }
|
| 31 |
+
|
| 32 |
+
def test_health_check():
    """
    The /health endpoint reports operational status and exposes a version key.
    """
    response = client.get("/health")
    assert response.status_code == 200
    data = response.json()
    assert data["status"] == "operational"
    # BUG FIX: the original `assert "version" in data == "`1.0.0`"` is a
    # chained comparison — ("version" in data) and (data == "1.0.0") — and a
    # dict payload can never equal a string, so it always failed. Assert key
    # presence instead (the exact version string is not visible from here).
    assert "version" in data
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_content(mock_ainvoke, mock_graph_response):
    """
    Test the /analyze endpoint with a mocked AI graph response.
    """
    mock_ainvoke.return_value = mock_graph_response

    payload = {
        "url": "https://example.com/article",
        "selection": None,
        "force_refresh": False,
    }

    response = client.post("/api/v1/analyze", json=payload)
    assert response.status_code == 200

    body = response.json()
    assert body["status"] == "Completed"

    # Verdict section mirrors the mocked pipeline state.
    verdict = body["verdict"]
    assert verdict["status"] == "Verified"
    assert verdict["claims_verified"] == 2

    # Identity section carries the verification flag and score.
    identity = body["identity"]
    assert identity["verified"] is True
    assert identity["score"] == 0.85

    # One agent report is expected from the mocked state.
    reports = body["details"]["reports"]
    assert len(reports) == 1
    assert reports[0]["agent"] == "Firecrawl Reader"
| 64 |
+
|
| 65 |
+
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_content_with_selection(mock_ainvoke, mock_graph_response):
    """
    Test the /analyze endpoint with a text selection and mocked AI graph response.
    """
    mock_ainvoke.return_value = mock_graph_response

    # Same endpoint, but with an explicit text selection and a forced refresh.
    payload = {
        "url": "https://example.com/article",
        "selection": "Some specific text from the article.",
        "force_refresh": True,
    }

    response = client.post("/api/v1/analyze", json=payload)
    assert response.status_code == 200

    body = response.json()
    assert body["status"] == "Completed"

    verdict = body["verdict"]
    assert verdict["status"] == "Verified"
    assert verdict["claims_verified"] == 2

    identity = body["identity"]
    assert identity["verified"] is True
    assert identity["score"] == 0.85

    reports = body["details"]["reports"]
    assert len(reports) == 1
    assert reports[0]["agent"] == "Firecrawl Reader"
| 89 |
+
|
| 90 |
+
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_validation_error(mock_ainvoke):
    """
    Test the /analyze endpoint with invalid input to trigger validation error.
    """
    # Pipeline is patched defensively; request validation should reject the
    # payload before the mock is ever awaited.
    bad_payload = {
        "url": "not_a_valid_url",
        "selection": None,
        "force_refresh": False,
    }

    response = client.post("/api/v1/analyze", json=bad_payload)
    # 422 Unprocessable Entity is FastAPI's status for schema validation errors.
    assert response.status_code == 422