Testys commited on
Commit
55086fb
·
1 Parent(s): 5a278ef

feat: adding verifact services with backend code and multi-agent workflow

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ GEMINI_API_KEY=your_gemini_api_key_here
2
+ FIRECRAWL_API_KEY=your_firecrawl_api_key_here
3
+ URLSCAN_API_KEY=your_urlscan_api_key_here
4
+
.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
CONTRIBUTING.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Verifacts Backend
2
+
3
+ Welcome to the Verifacts engineering team! This guide will help you set up your development environment and understand our engineering standards.
4
+
5
+ ## 🚀 Environment Setup
6
+
7
+ We use **Poetry** for dependency management to ensure deterministic builds across all micro-modules.
8
+
9
+ ### 1. Installation
10
+
11
+ ```bash
12
+ # Install Project Dependencies
13
+ poetry install
14
+ ```
15
+
16
+ ### 2\. Configuration
17
+
18
+ Copy the example environment file:
19
+
20
+ ```bash
21
+ cp .env.example .env
22
+ ```
23
+
24
+ **Required Variables:**
25
+
26
+ * `GEMINI_API_KEY`: For LLM extraction.
27
+ * `FIRECRAWL_API_KEY`: For web scraping.
28
+ * `GOOGLE_FACT_CHECK_KEY`: For verification.
29
+
30
+ ### 3\. Running the Server
31
+
32
+ Start the hot-reloading development server:
33
+
34
+ ```bash
35
+ poetry run uvicorn app.api.main:main --reload
36
+ ```
37
+
38
+ -----
39
+
40
+ ## 🌳 Git Workflow & Branching Strategy
41
+
42
+ We follow a strict branching model to keep our codebase stable. **Never push directly to `main`.**
43
+
44
+ ### Branch Naming Convention
45
+
46
+ * **Features:** `feat/short-description` (e.g., `feat/add-sentiment-node`)
47
+ * **Bug Fixes:** `fix/short-description` (e.g., `fix/firecrawl-timeout`)
48
+ * **Documentation:** `docs/short-description` (e.g., `docs/update-api-schema`)
49
+ * **Chore/Refactor:** `chore/short-description` (e.g., `chore/bump-poetry-version`)
50
+
51
+ ### The Workflow
52
+
53
+ 1. **Sync with Main:**
54
+ ```bash
55
+ git checkout main
56
+ git pull origin main
57
+ ```
58
+ 2. **Create Branch:**
59
+ ```bash
60
+ git checkout -b feat/my-new-feature
61
+ ```
62
+ 3. **Code & Test:** Write your code and ensure `poetry run pytest` passes.
63
+ 4. **Push & PR:** Push your branch and open a Pull Request (PR) for review.
64
+
65
+ -----
66
+
67
+ ## 📝 Commit Message Standards
68
+
69
+ We use **Conventional Commits** to automate our changelogs. Your commit message must look like this:
70
+
71
+ `<type>(<scope>): <short summary>`
72
+
73
+ ### Types
74
+
75
+ * `feat`: A new feature (e.g., adding a new LangGraph node).
76
+ * `fix`: A bug fix.
77
+ * `docs`: Documentation only changes.
78
+ * `style`: Formatting, missing semi-colons, etc. (no code change).
79
+ * `refactor`: A code change that neither fixes a bug nor adds a feature.
80
+ * `perf`: A code change that improves performance.
81
+ * `test`: Adding missing tests.
82
+ * `chore`: Maintenance tasks (e.g., updating `.gitignore`).
83
+
84
+ ### Examples
85
+
86
+ * ✅ `feat(graph): add sentiment analysis node to workflow`
87
+ * ✅ `fix(api): handle 404 error from Firecrawl`
88
+ * ✅ `docs(readme): update setup instructions for Windows`
89
+ * ❌ `Fixed the bug` (Too vague)
90
+ * ❌ `Added new agent` (Missing scope)
91
+
92
+ -----
93
+
94
+ ## 🛠️ How to Add a New Feature (The "Node" Workflow)
95
+
96
+ Adding intelligence to Verifacts means adding a **Node** to the LangGraph. Follow this 4-step process:
97
+
98
+ ### Step 1: Create the Logic (The Module)
99
+
100
+ Create a new file in `app/graph/nodes/`. It must accept `AgentState` and return a dictionary of updates.
101
+
102
+ * *File:* `app/graph/nodes/sentiment.py`
103
+ * *Function:* `async def sentiment_node(state: AgentState) -> Dict[str, Any]: ...`
104
+
105
+ ### Step 2: Update the State
106
+
107
+ If your node produces new data (e.g., a "sentiment score"), define it in the shared state.
108
+
109
+ * *File:* `app/graph/state.py`
110
+ * *Action:* Add `sentiment_score: float` to the `AgentState` TypedDict.
111
+
112
+ ### Step 3: Register in the Graph
113
+
114
+ Wire your new node into the orchestration flow.
115
+
116
+ * *File:* `app/graph/workflow.py`
117
+ * *Action:*
118
+ 1. `workflow.add_node("sentiment", sentiment_node)`
119
+ 2. Define when it runs (e.g., `workflow.add_edge("reader", "sentiment")`).
120
+
121
+ ### Step 4: Expose via API (Optional)
122
+
123
+ If the frontend needs to see this data, update the response model.
124
+
125
+ * *File:* `app/api/v1/models.py` (or `server.py`)
126
+ * *Action:* Add the field to the Pydantic Response model.
127
+
128
+ -----
129
+
130
+ ## 🧪 Testing Requirements
131
+
132
+ Before submitting a PR, ensure you have added tests for your new node.
133
+
134
+ ```bash
135
+ # Run unit tests
136
+ poetry run pytest
137
+
138
+ # Run linting manually (Recommended)
139
+ poetry run ruff check .
140
+ ```
141
+
142
+ ## Pull Request Reviews
143
+ All PRs must be reviewed by at least one other team member. Look for:
144
+
145
+ * Code quality and adherence to standards.
146
+ * Proper testing coverage.
147
+ * Clear and descriptive commit messages.
148
+
149
+
150
+ Thank you for contributing to Verifacts! Your efforts help us build a reliable and intelligent verification platform.
README.md CHANGED
@@ -1,11 +1 @@
1
- ---
2
- title: Verifacts Backend
3
- emoji: 🌖
4
- colorFrom: blue
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # verifacts-backend
 
 
 
 
 
 
 
 
 
 
app/__init__.py ADDED
File without changes
app/api/main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.api.v1.endpoints import router as v1_router
from app.core.config import config

# ASGI application object. NOTE(review): the object is named `main`, so the
# serve command must be `uvicorn app.api.main:main` — confirm deploy scripts.
main = FastAPI(
    title=config.PROJECT_NAME,
    version=config.VERSION,
    openapi_url=f"{config.API_PREFIX}/openapi.json"
)

# NOTE(review): wildcard `allow_origins` together with allow_credentials=True
# is rejected by browsers for credentialed requests — confirm this is intended.
main.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount all v1 endpoints (prefix comes from the router itself).
main.include_router(v1_router)

@main.get("/")
async def root():
    """Landing route confirming the API is reachable."""
    return {"message": "Welcome to the Verifacts Backend API!"}

@main.get("/health")
async def health_check():
    """Liveness probe: reports service status and deployed version."""
    return {
        "status": "operational",
        "message": "The Verifacts Backend API is running smoothly.",
        "version": config.VERSION
    }
app/api/v1/__init__.py ADDED
File without changes
app/api/v1/endpoints.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import logging
from fastapi import APIRouter, HTTPException
from app.core.models import AnalysisRequest, AnalysisResponse, IdentityData, VerdictData
from app.core.config import config

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

router = APIRouter(prefix=config.API_PREFIX, tags=["v1"])

@router.post("/analyze", response_model=AnalysisResponse)
async def analyze_content(request: AnalysisRequest) -> AnalysisResponse:
    """
    Core v1 endpoint to analyze and verify the sources of web contents.
    Triggers the analysis pipeline and multi-agent Langgraph workflow.

    Args:
        request: URL to analyze, optional text selection, and refresh flag.

    Returns:
        AnalysisResponse with verdict, identity and per-agent reports.

    Raises:
        HTTPException: 500 on any failure during the analysis pipeline.
    """
    try:
        # Seed state consumed by the LangGraph workflow.
        initial_state = {
            "url": str(request.url),
            "selection": request.selection,
            "force_refresh": request.force_refresh,
            "claims": [],
            "errors": [],
            "verification_results": [],
            "extracted_claims": [],
            "agent_reports": [],
        }
        # Lazy %-style args: formatting only happens if the record is emitted.
        logger.info("Starting analysis for URL: %s", request.url)

        # TODO: invoke the actual LangGraph workflow here. Until it is wired
        # in, the response is built entirely from defaults in initial_state.
        final_state = initial_state

        identity_data = IdentityData(
            verified=final_state.get("is_verified", False),
            score=final_state.get("credibility_score", 0.0),
        )
        verdict_data = VerdictData(
            status=final_state.get("verdict_status", "Unverified"),
            claims_counted=final_state.get("claims_counted", 0),
            claims_verified=final_state.get("claims_verified", 0),
            claims_sourced=final_state.get("claims_sourced", 0),
        )

        # Normalize the internal agent-report dicts into the public shape.
        formatted_reports = [
            {
                "agent": report.get("agent_name", "unknown"),
                "claims": report.get("output", []),
                "errors": report.get("errors", []),
            }
            for report in final_state.get("agent_reports", [])
        ]

        return AnalysisResponse(
            status=final_state.get("status", "Completed"),
            verdict=verdict_data,
            details={
                "reports": formatted_reports,
                "raw_claims": final_state.get("verification_results", []),
            },
            identity=identity_data,
        )

    except Exception as e:
        # Fixed: logger.exception records the traceback (logger.error did not),
        # the detail message gained its missing colon, and the cause is chained.
        logger.exception("Error during analysis")
        raise HTTPException(
            status_code=500,
            detail=f"Analysis of web content failed: {e}",
        ) from e
app/core/__init__.py ADDED
File without changes
app/core/cache.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import logging
from typing import Any, Optional

from redis import Redis
from langchain_core.globals import set_llm_cache
from langchain_community.cache import RedisCache, RedisSemanticCache
from langchain_openai import OpenAIEmbeddings

from app.core.config import config

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Shared connection for the whole module; reset to None below if unreachable.
redis_client = Redis.from_url(config.REDIS_URL)


def init_global_cache(semantic: bool = True) -> None:
    """Initialize a process-wide LangChain LLM cache backed by Redis.

    Args:
        semantic: When True, use an embedding-similarity cache
            (RedisSemanticCache); otherwise an exact-match RedisCache.
    """
    global redis_client
    if not redis_client:
        logger.warning("Redis client is not configured; caching will be disabled.")
        return

    if semantic:
        # Fixed: the message previously said "Google Embeddings" while the
        # code constructs OpenAI embeddings.
        logger.info("Initializing Redis Semantic Cache with OpenAI Embeddings.")
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        cache = RedisSemanticCache(
            redis_client=redis_client,
            embedding_function=embeddings,
            # Fixed: Config does not declare REDIS_SEMANTIC_INDEX, so a direct
            # attribute access raised AttributeError; fall back gracefully.
            index_name=getattr(config, "REDIS_SEMANTIC_INDEX", None) or "langchain_semantic_cache",
            score_threshold=0.85,
        )
    else:
        logger.info("Initializing standard Redis Cache.")
        cache = RedisCache(redis_client=redis_client)

    # Fixed: removed the redundant in-function re-import of set_llm_cache
    # (already imported at module top).
    set_llm_cache(cache)
    logger.info("Global Redis cache initialized successfully.")


try:
    # Verify connectivity once at import time; on failure, disable caching by
    # nulling the client so the helpers below degrade to warnings.
    redis_client.ping()
    logger.info("Successfully connected to Redis server.")
except Exception as e:
    logger.error(f"Failed to connect to Redis server: {e}")
    redis_client = None
50
+
51
+
def cache_get(key: str) -> Optional[Any]:
    """Fetch a cached value by key.

    Returns the raw Redis value, or None on a miss, on any Redis error,
    or when no Redis client is configured.
    """
    global redis_client
    if not redis_client:
        logger.warning("Redis client is not configured; cannot get cache.")
        return None
    try:
        cached = redis_client.get(key)
    except Exception as e:
        logger.error(f"Error retrieving key {key} from cache: {e}")
        return None
    outcome = "hit" if cached is not None else "miss"
    logger.info(f"Cache {outcome} for key: {key}")
    return cached
68
+
def cache_set(key: str, value: Any, ttl: int = config.CACHE_TTL) -> None:
    """Store *value* under *key* with an expiry of *ttl* seconds.

    Errors are logged and swallowed so a cache outage never breaks callers.
    """
    global redis_client
    if not redis_client:
        logger.warning("Redis client is not configured; cannot set cache.")
        return
    try:
        redis_client.set(name=key, value=value, ex=ttl)
    except Exception as e:
        logger.error(f"Error setting key {key} in cache: {e}")
    else:
        logger.info(f"Cache set for key: {key} with TTL: {ttl} seconds")
80
+
def cache_delete(key: str) -> None:
    """Remove *key* from the Redis cache; errors are logged, never raised."""
    global redis_client
    if not redis_client:
        logger.warning("Redis client is not configured; cannot delete cache.")
        return
    try:
        redis_client.delete(key)
    except Exception as e:
        logger.error(f"Error deleting key {key} from cache: {e}")
    else:
        logger.info(f"Cache deleted for key: {key}")
92
+
93
+
def cache_stats() -> Optional[dict]:
    """Return a small summary of Redis server statistics, or None on error.

    The summary covers memory usage, keyspace hit/miss counters, connected
    clients, and server uptime (fields of the Redis INFO command).
    """
    global redis_client
    if not redis_client:
        logger.warning("Redis client is not configured; cannot get stats.")
        return None
    try:
        info = redis_client.info()
    except Exception as e:
        logger.error(f"Error retrieving Redis stats: {e}")
        return None
    wanted = (
        "used_memory_human",
        "keyspace_hits",
        "keyspace_misses",
        "connected_clients",
        "uptime_in_seconds",
    )
    stats = {field: info.get(field) for field in wanted}
    logger.info(f"Redis cache stats: {stats}")
    return stats
114
+
115
+ # Usage Example
116
+ # init_global_cache(semantic=True)
117
+ # #ping
118
+
119
+ # if __name__ == "__main__":
120
+ # if not redis_client:
121
+ # logger.warning("Redis client is not configured; skipping ping.")
122
+
123
+ # if redis_client:
124
+ # try:
125
+ # redis_client.ping()
126
+ # logger.info("Ping to Redis server successful.")
127
+ # except Exception as e:
128
+ # logger.error(f"Ping to Redis server failed: {e}")
app/core/config.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Optional
from dotenv import load_dotenv


load_dotenv()  # Load environment variables from a .env file if present

class Config(BaseSettings):
    """
    Application configuration settings.
    Reads from environment variables by default.
    """
    PROJECT_NAME: str = "Verifacts Backend"
    VERSION: str = "1.0.0"
    API_PREFIX: str = "/api/v1"

    SECRET_KEY: str = os.getenv("SECRET_KEY", "default_secret_key")

    # LLM settings
    GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
    LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME", "gemini-2.5-flash")
    LLM_TEMPERATURE: float = float(os.getenv("LLM_TEMPERATURE", "0"))
    LLM_MAX_TOKEN: int = int(os.getenv("LLM_MAX_TOKEN", "1024"))

    # External service credentials
    FIRECRAWL_API_KEY: Optional[str] = os.getenv("FIRECRAWL_API_KEY")
    URLSCAN_API_KEY: Optional[str] = os.getenv("URLSCAN_API_KEY")

    # Redis connection settings
    REDIS_URL: str = os.getenv("REDIS_URL", "redis://localhost:6379/0")
    REDIS_HOST: Optional[str] = os.getenv("REDIS_HOST")
    # Fixed: os.getenv returns str; coerce so the defaults match the
    # Optional[int] annotations (pydantic does not validate defaults here).
    REDIS_PORT: Optional[int] = int(os.getenv("REDIS_PORT")) if os.getenv("REDIS_PORT") else None
    REDIS_PASSWORD: Optional[str] = os.getenv("REDIS_PASSWORD")
    REDIS_DB: Optional[int] = int(os.getenv("REDIS_DB")) if os.getenv("REDIS_DB") else None
    # Added: read by app.core.cache when naming the semantic-cache index;
    # it was previously accessed but never declared (AttributeError).
    REDIS_SEMANTIC_INDEX: Optional[str] = os.getenv("REDIS_SEMANTIC_INDEX")

    # API Configuration
    GOOGLE_FACT_CHECK_API_KEY: str = os.getenv("GOOGLE_FACT_CHECK_KEY", "")
    FACT_CHECK_API_URL: str = (
        "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    )
    TAVILY_API_KEY: Optional[str] = os.getenv("TAVILY_API_KEY")

    # Performance Settings
    API_TIMEOUT: int = 2  # seconds
    MAX_BATCH_SIZE: int = 20

    # Cache Settings (for future Redis integration)
    CACHE_ENABLED: bool = True
    CACHE_TTL: int = 86400  # 24 hours in seconds

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True
    )


# Singleton settings instance imported throughout the app.
config = Config()
app/core/models.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, List, Dict, Any, Literal


class AnalysisRequest(BaseModel):
    """Inbound payload for the /analyze endpoint."""
    url: HttpUrl = Field(..., description="The URL of the webpage to analyze.")
    selection: Optional[str] = Field(
        None,
        description="Optional specific text selection from the webpage."
    )
    force_refresh: bool = Field(
        False,
        description="Whether to force refresh the cached analysis."
    )


class IdentityData(BaseModel):
    """Source identity/credibility summary."""
    verified: bool = Field(..., description="Whether the source is verified.")
    score: float = Field(..., description="Credibility score of the source (0.0 to 1.0).")

class VerdictData(BaseModel):
    """Aggregate claim-verification counters for one analysis run."""
    status: str = Field(..., description="Verdict status (e.g., true, false, mixed).")
    claims_counted: int = Field(0, description="Number of claims evaluated.")
    claims_verified: int = Field(0, description="Number of claims verified as true.")
    claims_sourced: int = Field(0, description="Number of claims with sources provided.")

class AnalysisResponse(BaseModel):
    """Outbound payload for the /analyze endpoint."""
    status: str = Field(..., description="Status of the analysis request.")
    verdict: VerdictData = Field(..., description="Detailed verdict data.")
    identity: IdentityData = Field(..., description="Identity verification data of the source.")
    details: Dict[str, Any] = Field(..., description="Detailed agent reports and findings.")


class Provenance(BaseModel):
    """Where a claim came from and in what context."""
    source: Literal["selection", "extracted", "user_provided"] = Field(..., description="Source of the claim.")
    url: Optional[HttpUrl] = Field(None, description="URL from which the claim was extracted, if applicable.")
    context: Optional[str] = Field(None, description="Contextual information about the claim.")

class Claim(BaseModel):
    """A single atomic claim extracted from the analyzed content."""
    claim_id: str
    text: str = Field(..., description="The atomic factual claim statement")
    normalized_text: Optional[str] = Field(None, description="Normalized version of the claim text.")
    provenance: Provenance = Field(..., description="Provenance information of the claim.")
    confidence: Optional[float] = Field(None, description="Confidence score of claim extraction (0.0 to 1.0).")
    claim_type: Literal["factual", "opinion", "mixed", "ambiguous"] = Field(..., description="Type of the claim.")

class CredibilityVerdict(BaseModel):
    """Overall credibility assessment of a source."""
    trust_level: str = Field(..., description="Overall trust level of the source (e.g., high, medium, low).")
    score: float = Field(..., description="Credibility score of the source (0-100).")
    red_flags: List[str] = Field(..., description="List of identified red flags affecting credibility.")
    summary: str = Field(..., description="Summary of the credibility assessment.")
    # Fixed: List[str] for consistency with the rest of this module
    # (was lowercase builtin `list[str]`).
    source_used: List[str] = Field(..., description="List of sources used in the credibility assessment.")


class FactCheckVerdict(BaseModel):
    """Result for a single claim verification"""
    claim: str = Field(..., description="The factual claim being verified")
    verdict: str = Field(..., description="verified | debunked | mixture | unverified")
    textual_rating: Optional[str] = Field(None, description="Textual rating from the fact-checker")
    corroboration_url: Optional[str] = Field(None, description="URL to the fact-check source")
    fact_checker: Optional[str] = Field(None, description="Name of the fact-checking organization")
    checked_date: Optional[str] = None


class VerifyResponse(BaseModel):
    """Response model for /verify endpoint"""

    status: str  # "success" or "error"
    mode: str  # "granular" or "full"
    data: dict

# === Final Output Schema ===
class FinalReport(BaseModel):
    """Top-level report aggregating credibility, claims and fact-checks."""
    url: str = Field(..., description="Original URL")
    credibility: Dict = Field(..., description="Source credibility assessment")
    claims: List[str] = Field(..., description="Extracted factual claims")
    fact_checks: List[Dict] = Field(..., description="Fact-check verdicts per claim")
    # Fixed: default_factory avoids sharing intent-wise mutable defaults
    # (pydantic copies defaults, but the factory form is the documented idiom).
    search_insights: List[Dict] = Field(default_factory=list, description="Tavily search results with snippets for enrichment")
    overall_verdict: str = Field(..., description="Final truth rating: verified | debunked | mixture | unverified")
    summary: str = Field(..., description="One-paragraph overall summary")
    sources: List[str] = Field(default_factory=list, description="Key corroborating URLs")
app/services/__init__.py ADDED
File without changes
app/services/claims/__init__.py ADDED
File without changes
app/services/claims/agent.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import logging
import uuid
from typing import List, Dict, Any, Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

from app.services.llm_wrapper import llm_wrapper
from app.services.claims.tools import ClaimTools
from app.core.models import Claim, Provenance

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ExtractedClaimItem(BaseModel):
    """One claim as emitted by the extraction LLM (JSON output schema)."""
    text: str = Field(..., description="The extracted claim text.")
    type: str = Field(..., description="The type of the claim (factual, opinion, etc.).")


class ClaimsList(BaseModel):
    """Structured output schema passed to JsonOutputParser: all extracted claims."""
    claims: List[ExtractedClaimItem] = Field(..., description="List of extracted claims.")
+ claims: List[ExtractedClaimItem] = Field(..., description="List of extracted claims.")
24
+
25
+
26
+
27
+ class ClaimExtractionAgent:
28
+ """
29
+ Agent 2: Claim Extraction Agent.
30
+ Roles:
31
+ 1. Decide strategy (Passthrough vs Atomization).
32
+ 2. Call Scraping Tools if needed.
33
+ 3. Use LLM to extract and classify claims.
34
+ """
35
+
36
+ def __init__(self):
37
+ self.llm = llm_wrapper.get_llm()
38
+ self.output_parser = JsonOutputParser(pydantic_object=ClaimsList)
39
+ self.tools = ClaimTools()
40
+
41
+
42
+ async def run(self, verdict:Optional[Dict] = None) -> List[Claim]:
43
+ """
44
+ Main method to run the Claim Extraction Agent.
45
+ """
46
+ text_to_process = ""
47
+ source_type = "selection"
48
+ context_url = verdict.get("url") if verdict else None
49
+ selection = verdict.get("selection") if verdict else None
50
+ url = context_url
51
+ cleaned_bg = "" # Initialize here to avoid 'not defined' errors
52
+
53
+ if selection:
54
+ logger.info("Using user-provided text selection for claim extraction.")
55
+ text_to_process = selection
56
+
57
+ clean_sel, _ = self.tools.sanitize_text(selection, max_length=5000)
58
+ if self.tools.looks_like_propmpt_injection(clean_sel):
59
+ logger.warning("Potential prompt injection detected in user selection.")
60
+ return [self._create_ambiguous_claim("Potential prompt injection detected in user selection.", url, source_type)]
61
+
62
+ text_to_process = clean_sel
63
+
64
+ if url:
65
+ try:
66
+ logger.info(f"Fetching background context from FireCrawl for URL: {url}")
67
+ full_page_text = await self.tools.scrape_article_text.ainvoke(url)
68
+ if full_page_text:
69
+ context_snippet, _ = self.tools.sanitize_text(full_page_text, max_length=2000)
70
+ cleaned_bg = context_snippet.replace("\n", " ")
71
+ logger.info("Successfully fetched background context for selection.")
72
+
73
+ except Exception as e:
74
+ logger.warning(f"Failed to fetch background context from FireCrawl: {str(e)}")
75
+
76
+ elif url:
77
+ logger.info(f"No text selection provided, scraping article text from {url}.")
78
+
79
+ scraped_text = await self.tools.scrape_article_text.ainvoke(url)
80
+
81
+ if not scraped_text:
82
+ logger.warning("No text could be extracted from the article.")
83
+ return [self._create_ambiguous_claim("No text could be extracted from the article.", url, "extracted")]
84
+
85
+ text_to_process = scraped_text
86
+ source_type = "extracted"
87
+
88
+ if not text_to_process:
89
+ logger.error("No text available for claim extraction after processing.")
90
+ return [self._create_ambiguous_claim("No text available for claim extraction.", url, source_type)]
91
+
92
+
93
+ is_short_selection = len(text_to_process.split()) < 50
94
+ has_complexity = " and " in text_to_process.lower() or ";" in text_to_process or "," in text_to_process
95
+
96
+ should_atomize = (source_type == "extracted") or (has_complexity and cleaned_bg != "")
97
+
98
+ if should_atomize and self.llm:
99
+ # Fixed: Correct argument order matching method signature
100
+ return await self._atomize_and_extract_claims(
101
+ text=text_to_process,
102
+ url=url,
103
+ source=source_type, # This is the source type (selection/extracted)
104
+ source_type=source_type,
105
+ context=cleaned_bg
106
+ )
107
+ else:
108
+ return [self._create_ambiguous_claim(text_to_process, url, source_type)]
109
+
110
+
111
    async def _atomize_and_extract_claims(
        self,
        text: str,
        url: Optional[str],
        source: str,
        source_type: str,
        context: Optional[str] = None
    ) -> List[Claim]:
        """
        Atomizes the text into multiple claims using the LLM.

        Builds a prompt chain (prompt | llm | JSON parser) that splits
        compound statements in *text* into distinct, self-contained factual
        claims, then wraps each one in a Claim with provenance metadata.

        Args:
            text: The text to split into claims.
            url: Originating URL, if any; used only inside the context hint.
            source: Where the text came from ("selection" or "extracted");
                selects which context hint is injected into the prompt.
                NOTE(review): the caller currently passes the same value for
                ``source`` and ``source_type`` — confirm both parameters are
                really needed.
            source_type: Provenance source recorded on each emitted Claim.
            context: Optional page snippet used to disambiguate selections.

        Returns:
            A list of Claim objects; on any failure, a single low-confidence
            "ambiguous" fallback Claim.
        """

        context_instruction = ""

        # Build a context hint for the prompt, depending on where the text came from.
        if source == "selection" and context:
            context_instruction = (
                f"CONTEXT INFO:\n"
                f"The user selected the text below from a webpage ({url or 'unknown'}).\n"
                f"Here is a snippet of the page content to help you understand the topic:\n"
                f"--- BEGIN CONTEXT ---\n{context}\n--- END CONTEXT ---\n"
                f"Use this context to resolve ambiguities (e.g. what 'it' refers to), but ONLY extract claims from the 'USER SELECTION'."
            )

        elif source == "selection" and url:
            context_instruction = f"SOURCE URL: {url}. Use the domain to infer the likely topic if needed."

        elif source == "extracted":
            context_instruction = f"SOURCE URL: {url or 'unknown'}. Use the domain to infer the likely topic if needed."

        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert fact-checker. "
             "Your task is to extract distinct, checkable factual claims from the provided text.\n"
             "Rules:\n"
             "1. Split compound statements (e.g. 'X is true and Y is false' -> [X, Y]).\n"
             "2. Ignore pure opinions or rhetorical questions.\n"
             "3. Keep claims concise and self-contained.\n"
             "{context_instruction}\n\n"
             "{format_instructions}"),
            ("user", "USER SELECTION to analyze:\n{text}")
        ])

        chain = prompt | self.llm | self.output_parser

        try:
            result = await chain.ainvoke({
                "text": text,
                "context_instruction": context_instruction,
                "format_instructions": self.output_parser.get_format_instructions()
            })
            logger.info(f"Successfully extracted claims using atomization {result}.")

            claims = []

            # Handle both dict and list responses from the parser
            claims_list = result.get("claims", []) if isinstance(result, dict) else result

            for item in claims_list:
                # Items may be dicts ({"text": ..., "type": ...}) or bare strings.
                if isinstance(item, dict):
                    claim_text = item.get("text", str(item))
                    claim_type = item.get("type", "factual")
                else:
                    claim_text = str(item)
                    claim_type = "factual"

                claims.append(Claim(
                    claim_id=str(uuid.uuid4()),
                    text=claim_text,
                    normalized_text=claim_text.lower().strip(),
                    claim_type=claim_type,
                    provenance=Provenance(
                        source=source_type,
                        url=url,
                        # NOTE(review): by operator precedence this evaluates as
                        # (context_instruction[:200] + "...") if truthy else None,
                        # so "..." is appended even when nothing was truncated.
                        context=context_instruction[:200] + "..." if context_instruction else None,
                    ),
                    # Non-factual (opinion-like) claims get a lower confidence.
                    confidence=0.9 if claim_type == "factual" else 0.6
                ))
            logger.info(f"Extracted {len(claims)} claims using atomization.")
            return claims

        except Exception as e:
            logger.error(f"Error during claim atomization and extraction: {str(e)}")
            # Ensure source_type has a valid value for Provenance
            valid_source_type = source_type if source_type in ("selection", "extracted", "user_provided") else "extracted"
            return [self._create_ambiguous_claim("Error during claim extraction.", url, valid_source_type)]
195
+
196
+ def _create_ambiguous_claim(self, text: str, url: Optional[str], source_type: str) -> Claim:
197
+ """Fallback to create an ambiguous claim when extraction fails."""
198
+ # Ensure source_type has a valid value
199
+ valid_source_type = source_type if source_type in ("selection", "extracted", "user_provided") else "extracted"
200
+ return Claim(
201
+ claim_id=str(uuid.uuid4()),
202
+ text=text,
203
+ normalized_text=text.lower().strip(),
204
+ claim_type="ambiguous",
205
+ provenance=Provenance(
206
+ source=valid_source_type,
207
+ url=url,
208
+ context=text[:100] + "..." if text else None
209
+ ),
210
+ confidence=0.0
211
+ )
212
+
213
# Example Usage:
async def main():
    """Smoke-test the claim-extraction agent against a canned credibility verdict."""
    verdict = {'url': 'https://databackedafrica.com/', 'trust_level': 'medium-high', 'score': 80, 'red_flags': ['Brand new TLS certificate (3 days'], 'summary': None, 'source_used': ['https://databackedafrica.com/']}
    extractor = ClaimExtractionAgent()
    extracted = await extractor.run(verdict)
    for c in extracted:
        print(f"Claim ID: {c.claim_id}, Text: {c.text}, Type: {c.claim_type}, Confidence: {c.confidence}")


if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
app/services/claims/tools.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from typing import List, Dict, Any, Tuple, Optional
4
+ from langchain_core.tools import tool
5
+ from langchain_community.document_loaders.firecrawl import FireCrawlLoader
6
+
7
+ from app.core.config import config
8
+
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
11
+
12
+
13
class ClaimTools:
    """
    A collection of tools for fetching, extracting and cleaning texts
    for the claim extraction agent.
    """

    @staticmethod
    def sanitize_text(text: str, max_length: Optional[int] = None) -> Tuple[str, bool]:
        """
        Cleans and sanitizes the input text by removing unwanted characters,
        excessive whitespace, and truncating to max_length if specified.

        Args:
            text (str): The input text to sanitize.
            max_length (Optional[int]): Maximum length of the sanitized text.

        Returns:
            (cleaned_text, was_truncated): A tuple containing the cleaned text
            and a boolean indicating if truncation occurred.
        """
        if not text:
            return "", False

        # Remove zero-width spaces and BOM, normalize line endings, then
        # collapse all runs of whitespace (including newlines) to single spaces.
        text = text.replace("\u200b", " ").replace("\ufeff", "")
        cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
        cleaned = " ".join(cleaned.split())

        was_truncated = False
        if max_length and len(cleaned) > max_length:
            cleaned = cleaned[:max_length]
            was_truncated = True

        return cleaned, was_truncated

    @staticmethod
    @tool("scrape_article_text")
    async def scrape_article_text(url: str) -> str:
        """
        Extracts the main body text from an article given its URL.
        Useful when user provides a URL without specific text selection.
        """
        if not config.FIRECRAWL_API_KEY:
            logger.error("FIRECRAWL_API_KEY not set. Cannot use FireCrawl for extraction.")
            return ""

        try:
            loader = FireCrawlLoader(
                url=url,
                api_key=config.FIRECRAWL_API_KEY,
                mode="scrape"
            )
            documents = await loader.aload()
            logger.info(f"FireCrawl returned {len(documents)} documents for {url}")
            if not documents:
                logger.warning(f"FireCrawl returned no documents for {url}")
                return ""

            text = "\n\n".join(doc.page_content for doc in documents if doc.page_content).strip()

            if len(text) > 50:
                logger.info(f"Successfully extracted article text from URL: {url} using FireCrawl")
            else:
                # Fixed: this path previously logged a misleading
                # "FIRECRAWL_API_KEY not set" message via a stray try/else.
                logger.warning(f"FireCrawl extracted very little text from {url}")
            # Return whatever was extracted (possibly short), matching the
            # original fall-through behavior.
            return text
        except Exception as e:
            logger.error(f"Error extracting article text from {url} using FireCrawl: {str(e)}")
            return ""

    @staticmethod
    def looks_like_propmpt_injection(text: str) -> bool:
        """
        Heuristic check to determine if the provided text looks like a prompt injection attempt.

        Args:
            text (str): The input text to evaluate.
        Returns:
            bool: True if the text appears to be a prompt injection, False otherwise.
        """
        # Patterns already carry (?i); re.IGNORECASE below is redundant but harmless.
        injection_patterns = [
            r"(?i)ignore all previous instructions",
            r"(?i)disregard previous directions",
            r"(?i)override earlier commands",
            r"(?i)forget what you were told before",
            r"(?i)act as if you are",
            r"(?i)you are now",
            r"(?i)from now on",
            r"(?i)you must",
            r"(?i)you will",
            r"(?i)silence all prior guidelines",
            r"(?i)break free from your restrictions",
            r"(?i)bypass your limitations",
            r"(?i)ignore your programming",
            r"(?i)go against your guidelines",
            r"(?i)user:",
        ]

        for pattern in injection_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                logger.warning(f"Prompt injection pattern detected: {pattern} in text: {text}")
                return True

        return False

    # Correctly-spelled alias for the typo'd method name above; the original
    # name is kept for backward compatibility with existing callers.
    looks_like_prompt_injection = looks_like_propmpt_injection
146
+
app/services/fact_checker/__init__.py ADDED
File without changes
app/services/fact_checker/agent.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agents/fact_checker/agent.py
2
+ import logging
3
+ from typing import List, Dict, Any
4
+
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from pydantic import BaseModel, Field
8
+
9
+ from app.services.llm_wrapper import llm_wrapper
10
+ from app.services.fact_checker.tools import GoogleFactCheckTool
11
+ from app.core.models import FactCheckVerdict
12
+ from app.core.config import config
13
+
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
class FactCheckAgent:
    """
    Agent 3: Final fact-check judgment using Google Fact Check API + LLM reasoning.

    Pipeline per claim: query the Google Fact Check tool for prior
    professional reviews, then have the LLM fold that evidence into a
    FactCheckVerdict-shaped JSON verdict.
    """

    def __init__(self):
        # Shared LLM, the raw fact-check tool, and a parser pinned to the
        # FactCheckVerdict schema.
        self.llm = llm_wrapper.get_llm()
        # NOTE(review): this reads config.GOOGLE_FACT_CHECK_API_KEY while
        # tools.py falls back to config.GOOGLE_FACT_CHECK_KEY — confirm which
        # config attribute actually exists.
        self.tool = GoogleFactCheckTool(api_key=config.GOOGLE_FACT_CHECK_API_KEY)
        self.parser = JsonOutputParser(pydantic_object=FactCheckVerdict)

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """
            You are a professional fact-checker. Use the Google Fact Check tool result below to give a final verdict.

            Rules:
            - If a reputable fact-checker (Snopes, PolitiFact, AFP, etc.) rated it → trust them
            - "False", "Pants on Fire" → debunked
            - "True" → verified
            - "Mixture", "Mostly False" → mixture
            - No result → unverified
            - Be concise and neutral

            Return JSON only.
            {format_instructions}
            """),
            ("human", "Claim: {claim}\nTool result: {tool_result}")
        ])

        self.chain = self.prompt | self.llm | self.parser

    async def run(self, claim: str) -> Dict[str, Any]:
        """
        Verify a single claim and return a structured result.

        Returns:
            Dict with keys: agent, claim, verdict (FactCheckVerdict-shaped
            dict), raw_tool_result; on LLM failure, a conservative
            "unverified" verdict plus an "error" key.
        """
        log.info(f"FactCheckAgent verifying: {claim[:60]}...")

        # Step 1: Use tool to get raw fact-check data.
        # NOTE(review): calls the tool's private _search() rather than the
        # registered LangChain tool — intentional shortcut? Confirm.
        raw_result = await self.tool._search(claim)
        tool_output = str(raw_result)

        # Step 2: LLM makes final reasoned verdict
        try:
            verdict = await self.chain.ainvoke({
                "claim": claim,
                "tool_result": tool_output,
                "format_instructions": self.parser.get_format_instructions()
            })

            return {
                "agent": "fact_checker",
                "claim": claim,
                "verdict": verdict,
                "raw_tool_result": raw_result,
            }

        except Exception as e:
            log.error(f"LLM failed in FactCheckAgent: {e}")
            # Fail closed: report "unverified" with near-zero confidence
            # rather than propagating the exception to the orchestrator.
            return {
                "agent": "fact_checker",
                "claim": claim,
                "verdict": {
                    "verdict": "unverified",
                    "confidence": 0.1,
                    "explanation": "Fact-check processing failed",
                    "sources": []
                },
                "error": str(e)
            }
app/services/fact_checker/tools.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agents/fact_checker/tool.py
2
+ import asyncio
3
+ import hashlib
4
+ import logging
5
+ from typing import Dict, List, Optional
6
+
7
+ import aiohttp
8
+ from langchain_core.tools import tool
9
+
10
+ from app.core.config import config
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
class GoogleFactCheckTool:
    """LangChain tool that verifies claims using Google Fact Check Tools API.

    Results are memoized in-process, keyed by a hash of the normalized claim.
    """

    def __init__(self, api_key: str):
        # Fixed: getattr fallback avoids an AttributeError when the config
        # attribute is absent (agent.py passes GOOGLE_FACT_CHECK_API_KEY while
        # this file referenced GOOGLE_FACT_CHECK_KEY).
        self.api_key = api_key or getattr(config, "GOOGLE_FACT_CHECK_KEY", None)
        self.base_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
        self.cache: Dict[str, dict] = {}

    def _hash(self, claim: str) -> str:
        """Cache key: SHA-256 of the lowercased, stripped claim text."""
        return hashlib.sha256(claim.lower().strip().encode()).hexdigest()

    async def _search(self, claim: str) -> dict:
        """Query the API for *claim*, reading from / filling the local cache."""
        if cached := self.cache.get(self._hash(claim)):
            return cached

        if not self.api_key:
            return {"status": "error", "reason": "API key missing"}

        params = {"query": claim, "key": self.api_key, "languageCode": "en"}
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) as session:
                async with session.get(self.base_url, params=params) as resp:
                    # Non-200 responses are treated as "no claims found".
                    data = await resp.json() if resp.status == 200 else {}
                    result = self._parse(data.get("claims", []), claim)
                    self.cache[self._hash(claim)] = result
                    return result
        except Exception as e:
            log.warning(f"Fact-check API error: {e}")
            return {"status": "unverified", "reason": "API error"}

    def _parse(self, claims: List[dict], original: str) -> dict:
        """Distill the API response into a small verdict dict."""
        if not claims:
            return {
                "status": "unverified",
                "claim": original,
                "reason": "No fact-checks found",
            }

        # Fixed: guard against a present-but-empty claimReview list, which
        # previously raised IndexError.
        reviews = claims[0].get("claimReview") or [{}]
        review = reviews[0]
        rating = review.get("textualRating", "").lower()

        # Insertion order matters: "untrue"/"false" must match before "true"
        # so ratings like "Untrue" are not misread as verified.
        status_map = {
            "untrue": "debunked",
            "false": "debunked", "pants": "debunked", "incorrect": "debunked",
            "true": "verified", "accurate": "verified",
            "mixture": "mixture", "half": "mixture", "mostly": "mixture",
        }
        status = next((v for k, v in status_map.items() if k in rating), "unverified")

        return {
            "status": status,
            "claim": original,
            "textual_rating": review.get("textualRating"),
            "source_url": review.get("url"),
            "fact_checker": review.get("publisher", {}).get("name"),
            "review_date": review.get("reviewDate"),
        }

    # LangChain Tool
    # NOTE(review): @tool on an instance method exposes `self` in the tool's
    # input schema; FactCheckAgent currently bypasses this and calls _search()
    # directly. Confirm before registering this tool with an agent executor.
    @tool("google_fact_check")
    async def google_fact_check(self, claim: str) -> str:
        """
        Use this tool to verify factual claims against professional fact-checkers.
        Input: A single factual claim (e.g., "The Earth is flat")
        Output: Verification result with source
        """
        result = await self._search(claim)
        if result["status"] in ["verified", "debunked"]:
            return f"Fact-check result: {result['textual_rating']} by {result['fact_checker']}. Source: {result['source_url']}"
        return f"No reliable fact-check found for: {claim}"
84
+
app/services/identify/__init__.py ADDED
File without changes
app/services/identify/agent.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import uuid
4
+ from typing import Dict, Any, Optional
5
+
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import JsonOutputParser
8
+ from pydantic import BaseModel, Field
9
+
10
+ from app.services.identify.tools import SourceCredibilityTool
11
+ from app.services.llm_wrapper import llm_wrapper
12
+
13
+ from app.core.config import config
14
+ from app.core.models import CredibilityVerdict
15
+
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.INFO)
18
+
19
+
20
class SourceCredibilityAgent:
    """
    Agent responsible for assessing the credibility of a source URL.
    Uses raw tools to gather data and an LLM to analyze and produce a verdict.
    """

    def __init__(self):
        # Shared LLM, the urlscan-backed tool, and a parser pinned to the
        # CredibilityVerdict schema (the chain still yields a plain dict).
        self.llm = llm_wrapper.get_llm()
        self.tool = SourceCredibilityTool()
        self.output_parser = JsonOutputParser(
            pydantic_object=CredibilityVerdict
        )
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """
            You are a senior fact-checking analyst specializing in source credibility evaluation.

            Using the technical signals below, produce a final credibility verdict.

            Guidelines:
            - Be strict: new domains (<6 months), no SSL history, or malicious verdicts → very_low
            - Established domains (>3 years), clean records → high
            - Heavy trackers/ads + obscure ASN → downgrade
            - Never trust sites flagged by Google Safe Browsing or urlscan.io as malicious
            - Bias: infer only if strong patterns (e.g., known partisan ASN or domain name)
            - BE CONCISE in your final verdict summary.
            - BE CONSISTENT between trust_level and score.

            Return valid JSON only.
            {format_instructions}
            """.strip()),
            ("human", "Assess credibility of this source:\n\n{report_json}")
        ])

        self.chain = self.prompt | self.llm | self.output_parser

    async def run(self, url: str) -> Dict[str, Any]:
        """
        Main method to run the Source Credibility Agent.

        Args:
            url (str): The URL of the source to assess.

        Returns:
            Dict[str, Any]: CredibilityVerdict-shaped dict with keys url,
            trust_level, score, red_flags, summary, source_used. On any
            failure a fallback dict with trust_level "unknown" is returned
            instead of raising.
        """
        logger.info(f"Assessing credibility for URL: {url}")

        # Gather raw urlscan.io signals first; the LLM reasons over this JSON.
        output_report = await self.tool.check_source_credibility.ainvoke(url)

        try:
            # logger.info(f"Generating credibility verdict using LLM using prompt: {self.prompt}.")
            verdict = await self.chain.ainvoke({
                "report_json": json.dumps(output_report, indent=2),
                "format_instructions": self.output_parser.get_format_instructions()
            })
            # logger.info(f"Generated verdict: {verdict}")

            # Re-shape the parser output into the fixed verdict schema,
            # defaulting source_used to the assessed URL itself.
            final_verdict = {
                "url": url,
                "trust_level": verdict.get("trust_level"),
                "score": verdict.get("score"),
                "red_flags": verdict.get("red_flags"),
                "summary": verdict.get("summary"),
                "source_used": verdict.get("source_used") if verdict.get("source_used") else [url]
            }
            # logger.info(f"Credibility verdict for {url}: {final_verdict}")

            return final_verdict

        except Exception as e:
            logger.error(f"Error generating credibility verdict for {url}: {str(e)}")
            # Fail closed: "unknown" trust with score 0 rather than raising.
            return {
                "url": url,
                "trust_level": "unknown",
                "score": 0.0,
                "red_flags": ["error_generating_verdict"],
                "summary": "Could not generate credibility verdict due to an error.",
                "source_used": [url]
            }
99
+
100
+ # # Example usage:
101
+ # async def main():
102
+ # url = "https://databackedafrica.com/"
103
+ # agent = SourceCredibilityAgent()
104
+ # verdict = await agent.run(url)
105
+ # print(f"Credibility Verdict: {verdict}")
106
+
107
+ # if __name__ == "__main__":
108
+ # import asyncio
109
+ # asyncio.run(main())
app/services/identify/tools.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whois
2
+ import tldextract
3
+ import aiohttp
4
+ import datetime
5
+ import re
6
+ import asyncio
7
+ from urllib.parse import urlparse
8
+ from typing import Optional, Dict, Any
9
+ import os
10
+ from dotenv import load_dotenv
11
+
12
+ from langchain_core.tools import tool
13
+
14
+ load_dotenv()
15
+
16
+ from app.core.config import config
17
+
18
+ # class Config:
19
+ # GOOGLE_APIS_KEY: Optional[str] = os.getenv("GOOGLE_APIS_KEY")
20
+ # FIRECRAWL_API_KEY: Optional[str] = os.getenv("FIRECRAWL_API_KEY")
21
+ # URLSCAN_API_KEY: Optional[str] = os.getenv("URLSCAN_API_KEY")
22
+
23
+ # config = Config()
24
+
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
class SourceCredibilityTool:
    """
    A collection of tools for verifying source URLs.

    Wraps urlscan.io submission/polling and distills scan results into a
    compact dict of credibility signals for the LLM to reason over.
    """

    @staticmethod
    def extract_domain(url: str) -> str:
        """
        Extract the registrable domain (e.g. "example.co.uk") from a given URL.

        Returns "unknown" when no public suffix can be identified.
        """
        extracted = tldextract.extract(url)
        logger.info(f"Extracted components: {extracted}")
        if not extracted.suffix:
            logger.warning(f"No suffix found for URL: {url}")
            return "unknown"
        domain = f"{extracted.domain}.{extracted.suffix}"
        logger.info(f"Extracted domain: {domain}")
        return domain

    @staticmethod
    async def _submit_to_urlscan(url: str) -> Optional[str]:
        """
        Submit a URL to urlscan.io for analysis.

        Returns:
            The result-API URL to poll for the finished scan, or None on failure.
        """
        api_key = config.URLSCAN_API_KEY
        if not api_key:
            logger.error("URLSCAN_API_KEY is not set in the environment variables.")
            return None

        submit_url = "https://urlscan.io/api/v1/scan/"
        headers = {
            'Content-Type': 'application/json',
            'API-Key': api_key,
        }
        data = {
            'url': url,
            'visibility': 'public'
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(submit_url, json=data, headers=headers) as response:
                    if response.status == 200:
                        resp_json = await response.json()
                        scan_id = resp_json.get('uuid')
                        result_url = f"https://urlscan.io/api/v1/result/{scan_id}/"
                        # Fixed: the original read data.get("result") from the
                        # *request payload* (always None). Prefer the API link
                        # from the response, falling back to the constructed URL.
                        return resp_json.get("api") or result_url
                    else:
                        text = await response.text()
                        logger.error(f"Failed to submit URL to urlscan.io, status code: {response.status} {text}")
                        return None
        except aiohttp.ClientError as e:
            logger.error(f"Error submitting URL to urlscan.io: {e}")
            return None

    @staticmethod
    async def _fetch_urlscan_result(result_url: str) -> Optional[Dict[str, Any]]:
        """
        Fetch the result of a urlscan.io analysis.

        Returns the parsed JSON result, or None when the scan is not ready
        yet (urlscan returns 404 until the scan completes) or on error.
        """
        api_key = config.URLSCAN_API_KEY
        if not api_key:
            logger.error("URLSCAN_API_KEY is not set in the environment variables.")
            return None

        headers = {
            'API-Key': api_key,
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(result_url, headers=headers) as response:
                    if response.status == 200:
                        return await response.json()
                    text = await response.text()
                    logger.error(f"Failed to fetch urlscan.io result, status code: {response.status} {text}")
                    return None
        except aiohttp.ClientError as e:
            logger.error(f"Error fetching urlscan.io result: {e}")
            return None

    @staticmethod  # Fixed: decorator was missing (only worked when called on the class)
    def extract_credibility_signals(urlscan_result: Dict[str, Any]) -> Dict[str, Any]:
        """Distill a raw urlscan.io result into a flat dict of credibility signals."""
        data = urlscan_result
        page = data.get("page", {})
        stats = data.get("stats", {})
        verdicts = data.get("verdicts", {})
        task = data.get("task", {})
        lists = data.get("lists", {})

        return {
            "url": task.get("url"),
            "scan_date": task.get("time"),
            "screenshot_url": task.get("screenshotURL"),

            # Critical verdicts
            "malicious_detected": verdicts.get("overall", {}).get("malicious", False),
            "engine_detections": verdicts.get("engines", {}).get("maliciousTotal", 0),
            "suspicious_categories": verdicts.get("overall", {}).get("categories", []),

            # Domain & TLS age (missing data defaults to "old", i.e. not flagged)
            "domain_age_days": page.get("apexDomainAgeDays", 0),
            "tls_age_days": page.get("tlsAgeDays", 0),
            "is_new_domain": page.get("apexDomainAgeDays", 9999) < 180,
            "is_brand_new_tls": page.get("tlsAgeDays", 9999) < 60,

            # Security posture
            "secure_percentage": stats.get("securePercentage", 100),
            "uses_mixed_content": stats.get("securePercentage", 100) < 98,

            # Hosting
            "server": page.get("server"),
            "asn": page.get("asn"),
            "asn_name": page.get("asnname"),
            "ip": page.get("ip"),

            # Privacy / trackers (approximate)
            "total_requests": sum(s.get("count", 0) for s in stats.get("resourceStats", [])),
            "third_party_domains": len(lists.get("domains", [])) - 1,

            # Suspicious patterns
            "has_data_urls": any("data:" in r.get("request", {}).get("url", "") for r in data.get("data", {}).get("requests", [])),
            "redirects_to_suspicious": any(
                tldextract.extract(url).domain in ["bit", "tinyurl"] or tldextract.extract(url).suffix in ["ru", "xyz", "top"]
                for url in lists.get("linkDomains", [])
            ),

            # Bonus: popularity
            "umbrella_rank": next(
                (item["rank"] for item in data.get("meta", {}).get("processors", {}).get("umbrella", {}).get("data", []) if item["hostname"] == page.get("domain")),
                None
            ),
        }

    @staticmethod
    @tool("check_source_credibility")
    async def check_source_credibility(url: str) -> Dict[str, Any]:
        """
        Check the credibility of a source URL using urlscan.io.
        Returns a dictionary with credibility information.
        """
        result = {
            "url": url,
            "domain": SourceCredibilityTool.extract_domain(url),
            "urlscan_result": None,
            "verdict": None,
            "is_malicious": None,
            "suspicious": None,
            "categories": []
        }

        result_url = await SourceCredibilityTool._submit_to_urlscan(url)
        if not result_url:
            logger.error(f"Could not submit URL to urlscan.io: {url}")
            return result

        # urlscan scans take a while to finish; poll up to 10 times (~50 s).
        urlscan_data = None
        for _ in range(10):
            await asyncio.sleep(5)
            urlscan_data = await SourceCredibilityTool._fetch_urlscan_result(result_url)
            if urlscan_data:
                break

        urlscan_insights: Dict[str, Any] = {}
        if urlscan_data:
            result["urlscan_result"] = urlscan_data
            credibility_signals = SourceCredibilityTool.extract_credibility_signals(urlscan_data)
            urlscan_insights.update(credibility_signals)

        # NOTE(review): the success path returns only the distilled signals,
        # while the failure path above returns the base `result` skeleton —
        # confirm callers tolerate both shapes.
        return urlscan_insights
209
+
210
+
211
+
212
+ # # # Example usage:
213
+ # async def main():
214
+ # url = "https://bit.ly/3X9kP2m/"
215
+ # identifier = SourceCredibilityTool()
216
+
217
+ # domain = identifier.extract_domain(url)
218
+ # print(f"Extracted domain: {domain}")
219
+
220
+ # credibility = await identifier.check_source_credibility.ainvoke(url)
221
+ # print(f"Source credibility report: {credibility}")
222
+
223
+ # if __name__ == "__main__":
224
+ # import asyncio
225
+ # asyncio.run(main())
app/services/llm_wrapper.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from typing import List, Dict, Any, Optional
4
+ from langchain_google_genai import ChatGoogleGenerativeAI
5
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
6
+ from dotenv import load_dotenv
7
+
8
+ from app.core.config import config
9
+
10
+ load_dotenv() # Load environment variables from a .env file if present
11
+
12
class LLMWrapper:
    """
    Centralized LLM Wrapper for the Verifacts System.

    Standardizes model configurations, message formatting, and response
    handling. Intended to be used as a singleton via `get_instance()` or the
    module-level `llm_wrapper`.
    """

    _instance = None  # singleton cache populated by get_instance()

    def __init__(self):
        """Build the Gemini chat model from app-level configuration.

        Raises:
            ValueError: If GEMINI_API_KEY is not configured.
        """
        self.model_name = config.LLM_MODEL_NAME
        self.temperature = config.LLM_TEMPERATURE
        self.max_tokens = config.LLM_MAX_TOKEN
        self.api_key = config.GEMINI_API_KEY

        if not self.api_key:
            raise ValueError("GEMINI_API_KEY is not set in the environment variables.")

        # Fixed: removed the dead `self.llm = None` that was immediately overwritten.
        self.llm = ChatGoogleGenerativeAI(
            model=self.model_name,
            temperature=self.temperature,
            max_output_tokens=self.max_tokens,
            api_key=self.api_key
        )

    @classmethod
    def get_instance(cls):
        """Return the shared LLMWrapper, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def get_llm(self):
        """Returns the underlying LLM instance."""
        return self.llm


llm_wrapper = LLMWrapper.get_instance()
51
+
app/services/orchestrator.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import Dict, TypedDict, Annotated, List
4
+
5
+ from langchain_core.runnables import Runnable
6
+ from langgraph.graph import StateGraph, END
7
+ from langgraph.checkpoint.memory import MemorySaver # For state persistence
8
+ from redis import Redis # pip install redis
9
+ from langchain_community.cache import RedisCache
10
+
11
+ from app.services.identify.agent import SourceCredibilityAgent
12
+ from app.services.claims.agent import ClaimExtractionAgent
13
+ from app.services.fact_checker.agent import FactCheckAgent
14
+ from app.core.config import config
15
+ from app.services.shared_tools import tavily_search
16
+ from app.services.llm_wrapper import llm_wrapper
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from app.core.models import FinalReport
19
+ from langchain_core.output_parsers import JsonOutputParser
20
+ from langgraph.checkpoint.memory import MemorySaver
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logger.setLevel(logging.INFO)
25
+
26
class WorkflowState(TypedDict):
    """Shared state threaded through the LangGraph fact-checking workflow."""
    url: str  # page being verified
    selection: str  # optional user-highlighted text from that page
    credibility: Annotated[Dict, "Source credibility report"]
    # NOTE(review): extraction_node actually stores plain claim strings here,
    # not dicts — confirm the intended element type.
    claims: Annotated[List[Dict], "Extracted claims"]
    fact_checks: Annotated[List[Dict], "Fact check verdicts"]
    search_insights: Annotated[List[Dict], "Tavily search results with snippets for enrichment"]
    error: Annotated[str, "Error message, if any"]  # set by any node to halt downstream nodes
34
+
35
+
36
+ # === Agent Nodes ===
37
async def credibility_node(state: WorkflowState) -> WorkflowState:
    """Run the source-credibility agent and gate the pipeline on its verdict."""
    agent = SourceCredibilityAgent()
    try:
        target_url = state.get("url")
        # Guard: nothing to assess without a URL.
        if not target_url:
            state["error"] = "No URL provided for credibility check"
            return state

        report = await agent.run(target_url)  # Make sure agent.run() accepts url as string
        state["credibility"] = report
        logger.info(f"Credibility report: {report}")

        # A low-trust source aborts the rest of the workflow.
        if report.get("trust_level", "unknown") in ["low", "very_low"]:
            state["error"] = "Source credibility too low to proceed"
    except Exception as e:
        logger.error(f"Credibility check error: {str(e)}")
        state["error"] = f"Credibility check failed: {str(e)}"
    return state
54
+
55
async def extraction_node(state: WorkflowState) -> WorkflowState:
    """Extract checkable claims from the page/selection; keeps only factual ones."""
    # Short-circuit when an earlier node already failed.
    if state.get("error"):
        return state

    agent = ClaimExtractionAgent()
    try:
        credibility = state.get("credibility", {})
        # The extraction agent consumes the upstream credibility verdict.
        verdict = {
            "url": state.get("url"),
            "selection": state.get("selection"),
            "trust_level": credibility.get("trust_level"),
            "score": credibility.get("score"),
        }
        extracted = await agent.run(verdict)
        logger.info(f"Extracted {len(extracted)} claims")
        # Only factual claims move on to fact-checking, as plain text.
        state["claims"] = [claim.text for claim in extracted if claim.claim_type == "factual"]
    except Exception as e:
        logger.error(f"Claim extraction error: {str(e)}")
        state["error"] = f"Claim extraction failed: {str(e)}"
    return state
74
+
75
async def factcheck_node(state: WorkflowState) -> WorkflowState:
    """Run FactCheckAgent over every extracted claim, sequentially.

    Skips when a previous node failed or no claims were extracted.
    """
    if state.get("error") or not state.get("claims"):
        return state  # Skip if previous error or no claims
    agent = FactCheckAgent()
    try:
        fact_checks = []
        for claim in state["claims"]:
            result = await agent.run(claim)
            logger.info(f"Fact-check result for claim '{claim[:30]}...': {result}")
            fact_checks.append(result)
        state["fact_checks"] = fact_checks
    except Exception as e:
        # Log before recording the error, for consistency with the other
        # nodes (the original recorded the error but never logged it).
        logger.error(f"Fact-checking error: {str(e)}")
        state["error"] = f"Fact-checking failed: {str(e)}"
    return state
89
+
90
# === Tavily Enrichment (always runs after extraction) ===
async def search_enrichment_node(state: WorkflowState) -> WorkflowState:
    """Enrich each claim with Tavily web-search context.

    Failures are per-claim and non-fatal: a claim whose search fails is
    simply omitted from state["search_insights"].
    """
    if state.get("error") or not state.get("claims"):
        return state

    insights = []
    for claim in state["claims"]:
        try:
            # Dropped the old " site:reputable" suffix: the `site:`
            # operator requires a concrete domain, so it only narrowed
            # results to a non-existent host.
            query = f"fact check: {claim}"
            # LangChain tools take a single input mapping; the previous
            # ainvoke(query=..., max_results=...) call raised TypeError
            # because the positional `input` argument was missing.
            results = await tavily_search.ainvoke({"query": query, "max_results": 3})
            # The tavily_search tool stringifies its output; only attempt
            # URL extraction when a structured result list comes back
            # (iterating a string here used to raise and drop the claim).
            if isinstance(results, list):
                sources = [r["url"] for r in results if isinstance(r, dict) and "url" in r]
            else:
                sources = []
            insights.append({
                "claim": claim,
                "results": results,  # Includes snippets, answers, sources
                "sources": sources,
            })
        except Exception as e:
            logger.warning(f"Tavily failed for claim '{claim}': {e}")

    state["search_insights"] = insights
    return state
109
+
110
# === Compile Final Report ===
async def compile_report_node(state: WorkflowState) -> WorkflowState:
    """Summarize the whole run into a final report via the LLM.

    Writes overall_verdict, summary and sources into state.  On LLM or
    parsing failure it falls back to a basic "unverified" report instead
    of aborting the graph.
    """
    # LLM summarizes overall.  (The fourth rule's arrow was a mojibake
    # "\ufffd\ufffd" sequence; restored to match the other three rules.)
    prompt = ChatPromptTemplate.from_template("""
You are a fact-check report compiler. Analyze the following state and generate a final report.

State:
- URL: {url}
- Source Credibility: {credibility}
- Claims Extracted: {claims}
- Fact Check Results: {fact_checks}
- Search Insights: {search_insights}

Rules for verdict:
- If most claims are verified → "verified"
- If most claims are debunked → "debunked"
- If mixed results → "mixture"
- If insufficient evidence → "unverified"

{format_instructions}

Respond ONLY with valid JSON. Do not include any markdown formatting, explanations, or text outside the JSON object.
""")
    llm = llm_wrapper.get_llm()
    output_parser = JsonOutputParser(pydantic_object=FinalReport)
    chain = prompt | llm | output_parser

    try:
        compiled = await chain.ainvoke({
            "url": state.get("url", ""),
            "credibility": state.get("credibility", {}),
            "claims": state.get("claims", []),
            "fact_checks": state.get("fact_checks", []),
            "search_insights": state.get("search_insights", []),
            "format_instructions": output_parser.get_format_instructions()
        })
        logger.info(f"Compiled report: {compiled}")
        state["overall_verdict"] = compiled.get("overall_verdict", "unverified")
        state["summary"] = compiled.get("summary", "No summary generated")
        # Use .get(...) like the fallback path below: insights produced
        # before a partial failure may lack the "sources" key.
        state["sources"] = [s for insight in state.get("search_insights", []) for s in insight.get("sources", [])]
    except Exception as e:
        logger.error(f"Report compilation error: {str(e)}")
        # Fallback: Create a basic report without LLM
        state["overall_verdict"] = "unverified"
        state["summary"] = f"Report compilation failed. {len(state.get('claims', []))} claims extracted, {len(state.get('fact_checks', []))} fact-checks completed."
        state["sources"] = [s for insight in state.get("search_insights", []) for s in insight.get("sources", [])]
    return state
157
+
158
def decide_next_step(state: WorkflowState) -> str:
    """Route after the credibility check: END on failure/low trust, else extract."""
    if state.get("error"):
        # credibility_node records "Source credibility too low to proceed"
        # (or a failure message) in state["error"]; stop the graph here.
        return END
    cred = state.get("credibility", {})
    # credibility_node stores trust_level at the top level of the report,
    # but the old code only read credibility["verdict"]["trust_level"] —
    # a path that is never populated — so the low-trust short-circuit
    # never fired.  Check both locations to be safe.
    trust = cred.get("trust_level") or cred.get("verdict", {}).get("trust_level", "unknown")
    if trust in ["low", "very_low"]:
        return END  # Skip the rest of the pipeline for untrustworthy sources
    return "extraction_node"
163
+
164
# === Orchestrator ===
# Linear pipeline with one conditional gate after the credibility check:
#   credibility -> (extraction -> search enrichment -> fact-check -> report) | END
workflow = StateGraph(state_schema=WorkflowState)


workflow.add_node("credibility_node", credibility_node)
workflow.add_node("extraction_node", extraction_node)
workflow.add_node("search_enrichment_node", search_enrichment_node)
workflow.add_node("factcheck_node", factcheck_node)
workflow.add_node("compile_report_node", compile_report_node)

workflow.set_entry_point("credibility_node")

# decide_next_step returns either "extraction_node" or END directly,
# so no explicit path map is supplied here.
workflow.add_conditional_edges(
    "credibility_node", decide_next_step
)
workflow.add_edge("extraction_node", "search_enrichment_node")
workflow.add_edge("search_enrichment_node", "factcheck_node")
workflow.add_edge("factcheck_node", "compile_report_node")
workflow.add_edge("compile_report_node", END)


# In-memory checkpointer: state survives only for the process lifetime.
memory = MemorySaver()
graph = workflow.compile(checkpointer=memory)
187
+
188
+
189
async def run_orchestrator(url: str, selection: str) -> WorkflowState:
    """Run the full fact-checking graph for a URL and optional selection.

    Args:
        url: Article URL to analyze.
        selection: User-highlighted text to focus claim extraction on.

    Returns:
        The final WorkflowState, including fact_checks and any error.
    """
    initial_state: WorkflowState = {
        "url": url,
        "selection": selection,
        "credibility": {},
        "claims": [],
        "fact_checks": [],
        # Was missing before: the schema declares this key and the
        # enrichment/report nodes read it, so initialize it alongside
        # the other list-valued fields.
        "search_insights": [],
        "error": "",
    }
    # NOTE(review): a fixed thread_id means every run shares one
    # checkpointer thread — consider deriving it per request/URL.
    final_state = await graph.ainvoke(initial_state, config={"configurable": {"thread_id": "main"}})
    return final_state
200
+
201
# Example usage
if __name__ == "__main__":
    # Ad-hoc smoke test: runs the whole multi-agent pipeline against a
    # live article, so it needs network access and valid API keys.
    test_url = "https://www.nbcnews.com/politics/donald-trump/trump-cnn-warner-bros-discovery-netflix-paramount-rcna248518"
    test_selection = "Paramount initiated a hostile bid, offering shareholders $30 per share."

    result_state = asyncio.run(run_orchestrator(test_url, test_selection))
    if result_state.get("error"):
        logger.error(f"Orchestration failed: {result_state['error']}")
    else:
        logger.info(f"Orchestration completed successfully. Fact-checks: {result_state['fact_checks']}")
app/services/shared_tools.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.tools import tool
2
+ from app.core.cache import cache_get, cache_set, cache_delete, cache_stats
3
+ from app.core.config import config
4
+ from langchain_community.tools.tavily_search import TavilySearchResults
5
+
6
@tool("cache_query")
async def cache_query(key: str) -> str:
    """
    Query a value from the global cache. Use to check if data is cached.
    Input: cache key (e.g., "claim:XYZ")
    """
    value = cache_get(key)
    # Compare against None explicitly: falsy cached values such as 0,
    # "" or False are legitimate hits and must not be reported as misses.
    # (Assumes cache_get returns None for missing keys — confirm.)
    return str(value) if value is not None else "Not found in cache"
+
15
@tool("cache_invalidate")
async def cache_invalidate(key: str) -> str:
    """
    Delete a key from global cache. Use to force refresh.
    Input: cache key
    """
    # cache_delete reports whether the key actually existed.
    if cache_delete(key):
        return "Deleted"
    return "Key not found"
+
24
@tool("cache_stats")
async def get_cache_stats() -> str:
    """
    Get global cache statistics. Use to monitor cache health.
    """
    stats = cache_stats()
    # Stringified so the LLM tool-calling layer receives plain text.
    return str(stats)
+
31
@tool("tavily_search")
async def tavily_search(query: str, max_results: int = 5) -> str:
    """
    Advanced AI-powered web search. Use for complex research or when standard search lacks context.
    Returns summarized results with sources.
    """
    # Renamed from `tool`: the old local variable shadowed the imported
    # @tool decorator inside this function body.
    search_tool = TavilySearchResults(
        max_results=max_results,
        # NOTE(review): TavilySearchResults normally takes the key via
        # `tavily_api_key` / the TAVILY_API_KEY env var — confirm that
        # this `api_key` kwarg is actually honored.
        api_key=config.TAVILY_API_KEY,  # Add to .env
        search_depth="advanced",
        include_answer=True,
        include_raw_content=True,
    )
    results = await search_tool.ainvoke(input=query)
    # Stringified for LLM consumption; callers needing structured data
    # should parse this or use the underlying API wrapper directly.
    return str(results)
+
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "verifacts-backend"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "Testimony Adekoya"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.10, <4.0.0"
10
+ dependencies = [
11
+ "langchain-core (>=1.1.0,<2.0.0)",
12
+ "langchain-community (>=0.4.1,<0.5.0)",
13
+ "fastapi (>=0.121.3,<0.122.0)",
14
+ "uvicorn[standard] (>=0.38.0,<0.39.0)",
15
+ "pydantic (>=2.12.4,<3.0.0)",
16
+ "sqlalchemy (>=2.0.44,<3.0.0)",
17
+ "redis (>=7.1.0,<8.0.0)",
18
+ "httpx (>=0.28.1,<0.29.0)",
19
+ "python-multipart (>=0.0.20,<0.0.21)",
20
+ "langgraph (>=1.0.3,<2.0.0)",
21
+ "langchain-google-genai (>=3.1.0,<4.0.0)",
22
+ "python-dotenv (>=1.2.1,<2.0.0)",
23
+ "pytest (>=9.0.1,<10.0.0)",
24
+ "python-whois (>=0.9.6,<0.10.0)",
25
+ "tldextract (>=5.3.0,<6.0.0)",
26
+ "firecrawl (>=4.9.0,<5.0.0)",
27
+ "resend (>=2.19.0,<3.0.0)",
28
+ "newspaper4k (>=0.9.4.1,<0.10.0.0)",
29
+ "python-json-logger (>=4.0.0,<5.0.0)",
30
+ "langchain (>=1.1.3,<2.0.0)",
31
+ "tavily-python (>=0.7.14,<0.8.0)",
32
+ "langchain-openai (>=1.1.1,<2.0.0)",
33
+ "langchain-tavily (>=0.2.13,<0.3.0)"
34
+ ]
35
+
36
+
37
+ [build-system]
38
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
39
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pydantic
4
+ sqlalchemy
5
+ pydantic[email]
6
+ alembic
7
+ redis
8
+ httpx
9
+ python-multipart
10
+ langchain
11
+ langchain-core
12
+ langgraph
13
+ langchain-community
14
+ langchain-google-genai
tests/test_api.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from fastapi.testclient import TestClient
3
+ from unittest.mock import patch, AsyncMock
4
+
5
+ from app.api.main import main
6
+
7
+ client = TestClient(main)
8
+
9
+
10
@pytest.fixture
def mock_graph_response():
    """
    Returns a fake state object that simulates a completed AI analysis.
    """
    # Shape mirrors the final LangGraph state consumed by the /analyze
    # endpoint; keep these keys in sync with the response-mapping code.
    return {
        "is_verified_entity": True,
        "identity_score": 0.85,
        "verdict_status": "Verified",
        "extracted_claims": ["Claim 1", "Claim 2"],
        "claims_verified_count": 2,
        "claims_sourced_count": 2,
        "verification_results": [{"claim": "Claim 1", "status": "True"}],
        "agent_reports": [
            {
                "agent_name": "Firecrawl Reader",
                "output": ["Claim 1", "Claim 2"],
                "errors": []
            }
        ]
    }
31
+
32
def test_health_check():
    """The /health endpoint reports operational status and a version."""
    response = client.get("/health")
    assert response.status_code == 200
    data = response.json()
    assert data["status"] == "operational"
    # The old `assert "version" in data == "`1.0.0`"` was a chained
    # comparison — it also compared the whole response dict against a
    # string, so it could never pass.  The intended check is presence.
    assert "version" in data
38
+
39
+
40
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_content(mock_ainvoke, mock_graph_response):
    """
    Test the /analyze endpoint with a mocked AI graph response.
    """
    mock_ainvoke.return_value = mock_graph_response

    body = {
        "url": "https://example.com/article",
        "selection": None,
        "force_refresh": False,
    }
    response = client.post("/api/v1/analyze", json=body)
    assert response.status_code == 200

    payload = response.json()
    verdict = payload["verdict"]
    identity = payload["identity"]
    reports = payload["details"]["reports"]

    assert payload["status"] == "Completed"
    assert verdict["status"] == "Verified"
    assert verdict["claims_verified"] == 2
    assert identity["verified"] is True
    assert identity["score"] == 0.85
    assert len(reports) == 1
    assert reports[0]["agent"] == "Firecrawl Reader"
64
+
65
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_content_with_selection(mock_ainvoke, mock_graph_response):
    """
    Test the /analyze endpoint with a text selection and mocked AI graph response.
    """
    mock_ainvoke.return_value = mock_graph_response

    body = {
        "url": "https://example.com/article",
        "selection": "Some specific text from the article.",
        "force_refresh": True,
    }
    response = client.post("/api/v1/analyze", json=body)
    assert response.status_code == 200

    payload = response.json()
    verdict = payload["verdict"]
    identity = payload["identity"]
    reports = payload["details"]["reports"]

    assert payload["status"] == "Completed"
    assert verdict["status"] == "Verified"
    assert verdict["claims_verified"] == 2
    assert identity["verified"] is True
    assert identity["score"] == 0.85
    assert len(reports) == 1
    assert reports[0]["agent"] == "Firecrawl Reader"
89
+
90
@patch("app.api.v1.endpoints.verifacts_pipeline.ainvoke", new_callable=AsyncMock)
def test_analyze_validation_error(mock_ainvoke):
    """
    Test the /analyze endpoint with invalid input to trigger validation error.
    """
    response = client.post(
        "/api/v1/analyze",
        json={
            "url": "not_a_valid_url",
            "selection": None,
            "force_refresh": False,
        },
    )
    # Unprocessable Entity due to validation error
    assert response.status_code == 422