Spaces:
Sleeping
Sleeping
Tawhid Bin Omar committed on
Commit ·
8176754
0
Parent(s):
Initial deployment of RealityCheck AI backend
Browse files- .env.example +1 -0
- .gitignore +34 -0
- Dockerfile +27 -0
- README.md +46 -0
- README_HF.md +35 -0
- analysis/__init__.py +16 -0
- analysis/claim_extractor.py +141 -0
- analysis/consistency_checker.py +133 -0
- analysis/coverage_analyzer.py +179 -0
- analysis/graph_generator.py +200 -0
- analysis/scorer.py +98 -0
- analysis/stability_tester.py +146 -0
- main.py +336 -0
- requirements.txt +13 -0
.env.example
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
HUGGINGFACE_API_KEY=your_huggingface_api_key_here
|
.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
venv/
|
| 8 |
+
env/
|
| 9 |
+
ENV/
|
| 10 |
+
.venv
|
| 11 |
+
|
| 12 |
+
# Environment
|
| 13 |
+
.env
|
| 14 |
+
.env.local
|
| 15 |
+
|
| 16 |
+
# IDE
|
| 17 |
+
.vscode/
|
| 18 |
+
.idea/
|
| 19 |
+
*.swp
|
| 20 |
+
*.swo
|
| 21 |
+
|
| 22 |
+
# OS
|
| 23 |
+
.DS_Store
|
| 24 |
+
Thumbs.db
|
| 25 |
+
|
| 26 |
+
# Testing
|
| 27 |
+
.pytest_cache/
|
| 28 |
+
.coverage
|
| 29 |
+
htmlcov/
|
| 30 |
+
|
| 31 |
+
# Build
|
| 32 |
+
dist/
|
| 33 |
+
build/
|
| 34 |
+
*.egg-info/
|
Dockerfile
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
+
|
| 10 |
+
# Copy requirements
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
|
| 13 |
+
# Install Python dependencies
|
| 14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
+
|
| 16 |
+
# Download models at build time
|
| 17 |
+
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
|
| 18 |
+
RUN python -c "from transformers import pipeline; pipeline('text-classification', model='microsoft/deberta-v3-xsmall')"
|
| 19 |
+
|
| 20 |
+
# Copy application code
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# Expose port
|
| 24 |
+
EXPOSE 7860
|
| 25 |
+
|
| 26 |
+
# Run the application
|
| 27 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: RealityCheck AI Backend
|
| 3 |
+
emoji: 🧠
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
sdk_version: 3.9
|
| 8 |
+
app_port: 7860
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# RealityCheck AI - Backend API
|
| 14 |
+
|
| 15 |
+
Understanding analysis engine that evaluates how well someone understands a concept by analyzing their explanation.
|
| 16 |
+
|
| 17 |
+
## What This Does
|
| 18 |
+
|
| 19 |
+
- Extracts claims from explanations
|
| 20 |
+
- Checks logical consistency
|
| 21 |
+
- Analyzes concept coverage
|
| 22 |
+
- Tests explanation stability
|
| 23 |
+
- Returns understanding scores
|
| 24 |
+
|
| 25 |
+
## API Endpoints
|
| 26 |
+
|
| 27 |
+
- `POST /analyze` - Analyze user explanation
|
| 28 |
+
- `GET /concepts` - Sample concepts list
|
| 29 |
+
- `GET /health` - Health check
|
| 30 |
+
|
| 31 |
+
## Setup
|
| 32 |
+
|
| 33 |
+
This Space requires:
|
| 34 |
+
- `HUGGINGFACE_API_KEY` in Settings → Repository secrets
|
| 35 |
+
|
| 36 |
+
## Tech Stack
|
| 37 |
+
|
| 38 |
+
- FastAPI (Python)
|
| 39 |
+
- Sentence Transformers
|
| 40 |
+
- Mistral-7B-Instruct (via API)
|
| 41 |
+
- DeBERTa (NLI)
|
| 42 |
+
|
| 43 |
+
## Links
|
| 44 |
+
|
| 45 |
+
- Frontend: [Add your Netlify URL]
|
| 46 |
+
- GitHub: [Add your repo URL]
|
README_HF.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: RealityCheck AI Backend
|
| 2 |
+
emoji: 🧠
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: indigo
|
| 5 |
+
sdk: docker
|
| 6 |
+
pinned: false
|
| 7 |
+
license: mit
|
| 8 |
+
app_file: main.py
|
| 9 |
+
|
| 10 |
+
# RealityCheck AI - Conceptual Understanding Diagnostic Engine
|
| 11 |
+
|
| 12 |
+
This Space hosts the FastAPI backend for RealityCheck AI, a system that evaluates conceptual understanding through explanation analysis.
|
| 13 |
+
|
| 14 |
+
## Features
|
| 15 |
+
|
| 16 |
+
- Multi-signal understanding analysis
|
| 17 |
+
- Pretrained AI models (no custom training)
|
| 18 |
+
- Logical consistency checking
|
| 19 |
+
- Concept coverage analysis
|
| 20 |
+
- Stability testing
|
| 21 |
+
|
| 22 |
+
## API Endpoints
|
| 23 |
+
|
| 24 |
+
- `POST /analyze` - Analyze user explanation
|
| 25 |
+
- `GET /concepts` - Get sample concepts
|
| 26 |
+
- `GET /health` - Health check
|
| 27 |
+
|
| 28 |
+
## Environment Variables
|
| 29 |
+
|
| 30 |
+
Required:
|
| 31 |
+
- `HUGGINGFACE_API_KEY` - Your Hugging Face API token
|
| 32 |
+
|
| 33 |
+
## Usage
|
| 34 |
+
|
| 35 |
+
See full documentation at: [GitHub Repository URL]
|
analysis/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Analysis module initialization
|
| 2 |
+
from .claim_extractor import ClaimExtractor
|
| 3 |
+
from .graph_generator import ConceptGraphGenerator
|
| 4 |
+
from .consistency_checker import ConsistencyChecker
|
| 5 |
+
from .coverage_analyzer import CoverageAnalyzer
|
| 6 |
+
from .stability_tester import StabilityTester
|
| 7 |
+
from .scorer import UnderstandingScorer
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
'ClaimExtractor',
|
| 11 |
+
'ConceptGraphGenerator',
|
| 12 |
+
'ConsistencyChecker',
|
| 13 |
+
'CoverageAnalyzer',
|
| 14 |
+
'StabilityTester',
|
| 15 |
+
'UnderstandingScorer'
|
| 16 |
+
]
|
analysis/claim_extractor.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Claim Extractor
|
| 3 |
+
Breaks down user explanations into individual claims/statements
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
import os
|
| 8 |
+
import requests
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
class ClaimExtractor:
    """Breaks a free-text explanation into atomic claims with embeddings.

    Uses the Mistral-7B-Instruct HF Inference API for extraction (with a
    regex sentence-splitting fallback) and all-MiniLM-L6-v2 for embeddings.
    """

    def __init__(self):
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = False
        self._initialize()

    def _initialize(self):
        """Warm up the embedding model and record readiness."""
        try:
            # The first encode triggers model load, which is slow on first run.
            self.embedding_model.encode("test")
            self._ready = True
        except Exception as e:
            # Keep the service alive even if the model fails to load; callers
            # should consult is_ready() before relying on embeddings.
            print(f"Claim extractor initialization error: {e}")  # TODO: better error handling
            self._ready = False

    def is_ready(self) -> bool:
        """Return True if the embedding model loaded successfully."""
        return self._ready

    async def extract_claims(self, explanation: str) -> List[Dict]:
        """Extract atomic claims from a user explanation.

        Returns:
            List of claim dicts with keys:
            - id: stable identifier ('claim_<i>')
            - text: the claim itself
            - type: 'definition' | 'causal' | 'assumption' | 'example' | 'statement'
            - embedding: semantic vector as a list of floats
            - confidence: extraction confidence (fixed placeholder for the demo)
        """
        # NOTE(review): the original annotated this as List[Dict[str, any]],
        # where lowercase `any` is the builtin function, not typing.Any.
        # NOTE(review): _llm_extract_claims performs a blocking requests.post
        # inside this async path; consider run_in_executor if it stalls the loop.
        claims_raw = await self._llm_extract_claims(explanation)

        # Attach embeddings and heuristic metadata to each raw claim.
        claims = []
        for i, claim_text in enumerate(claims_raw):
            embedding = self.embedding_model.encode(claim_text)
            claim_type = self._classify_claim_type(claim_text)

            claims.append({
                'id': f'claim_{i}',
                'text': claim_text,
                'type': claim_type,
                'embedding': embedding.tolist(),
                'confidence': 0.85  # Simplified for demo
            })

        return claims

    async def _llm_extract_claims(self, explanation: str) -> List[str]:
        """Ask the LLM for numbered atomic claims; fall back to sentence splitting."""
        prompt = f"""<s>[INST] You are a precise claim extraction system. Break down the following explanation into atomic claims. Each claim should be a single, testable statement.

Explanation: {explanation}

Extract each claim on a new line, numbered. Focus on:
1. Definitions (what things are)
2. Causal relationships (X causes Y)
3. Assumptions (implicit or explicit)
4. Properties and characteristics

Output only the numbered claims, nothing else. [/INST]"""

        try:
            headers = {"Authorization": f"Bearer {self.hf_api_key}"}
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 500,
                    "temperature": 0.3,
                    "return_full_text": False
                }
            }

            response = requests.post(self.llm_endpoint, headers=headers, json=payload, timeout=30)

            if response.status_code == 200:
                result = response.json()
                text = result[0]['generated_text'] if isinstance(result, list) else result.get('generated_text', '')

                # Keep lines that look like "1. ...", "2) ..." or "- ..." and
                # strip the numbering/bullet prefix.
                claims = []
                for line in text.split('\n'):
                    line = line.strip()
                    if line and (line[0].isdigit() or line.startswith('-')):
                        claim = line.lstrip('0123456789.-) ').strip()
                        if claim:
                            claims.append(claim)

                return claims if claims else [explanation]  # Fallback to full explanation
            else:
                # Non-200 from the inference API: degrade gracefully.
                return self._fallback_extraction(explanation)

        except Exception as e:
            print(f"LLM extraction error: {e}")
            return self._fallback_extraction(explanation)

    def _fallback_extraction(self, explanation: str) -> List[str]:
        """Fallback: split on sentence punctuation, dropping fragments of <= 10 chars."""
        import re
        sentences = re.split(r'[.!?]+', explanation)
        return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

    def _classify_claim_type(self, claim: str) -> str:
        """Classify a claim by keyword patterns; the first matching category wins."""
        claim_lower = claim.lower()

        # Definition patterns
        if any(pattern in claim_lower for pattern in ['is a', 'is the', 'refers to', 'means', 'defined as']):
            return 'definition'

        # Causal patterns
        elif any(pattern in claim_lower for pattern in ['causes', 'leads to', 'results in', 'because', 'therefore']):
            return 'causal'

        # Example patterns
        elif any(pattern in claim_lower for pattern in ['for example', 'such as', 'like', 'instance']):
            return 'example'

        # Assumption patterns (note: bare substring 'if' matches inside other words)
        elif any(pattern in claim_lower for pattern in ['assume', 'given that', 'suppose', 'if']):
            return 'assumption'

        else:
            return 'statement'
|
analysis/consistency_checker.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Consistency Checker Module
|
| 3 |
+
Uses NLI models to detect logical contradictions and inconsistencies
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
import itertools
|
| 9 |
+
|
| 10 |
+
class ConsistencyChecker:
    """Detects contradictions and circular definitions among claims via NLI."""

    def __init__(self):
        try:
            # Use a smaller NLI model for faster inference.
            # NOTE(review): deberta-v3-xsmall is a base checkpoint; unless a
            # fine-tuned NLI head is intended, its labels will never contain
            # 'contradiction'/'entailment' and _check_entailment always
            # returns 'neutral' -- confirm the model choice.
            self.nli_model = pipeline(
                "text-classification",
                model="microsoft/deberta-v3-xsmall",  # Smaller, faster model
                device=-1  # CPU
            )
            self._ready = True
        except Exception as e:
            print(f"NLI model initialization error: {e}")
            self._ready = False
            self.nli_model = None

    def is_ready(self) -> bool:
        """Return True if the NLI pipeline loaded successfully."""
        return self._ready

    async def check_consistency(self, claims: List[Dict]) -> Dict:
        """Check pairwise logical consistency between claims using NLI.

        Args:
            claims: dicts with at least a 'text' key.

        Returns:
            {
                'consistency_score': float (0-100),
                'contradictions': List[Dict] (top 5),
                'circular_definitions': List[Dict] (top 3),
                'entailment_failures': List[Dict] (currently always empty)
            }
        """
        # Fewer than two claims cannot conflict with each other.
        if not claims or len(claims) < 2:
            return {
                'consistency_score': 100.0,
                'contradictions': [],
                'circular_definitions': [],
                'entailment_failures': []
            }

        contradictions = []
        circular_refs = []
        claim_texts = [claim['text'] for claim in claims]

        # Check every unordered pair of claims.
        for i, j in itertools.combinations(range(len(claim_texts)), 2):
            claim1 = claim_texts[i]
            claim2 = claim_texts[j]

            # Contradiction check via NLI (skipped when the model is absent).
            if self._ready and self.nli_model:
                try:
                    relation = self._check_entailment(claim1, claim2)
                    if relation == 'contradiction':
                        contradictions.append({
                            'claim1': claim1,
                            'claim2': claim2,
                            'confidence': 0.85,
                            'suggestion': 'These statements appear to contradict each other. Review the logical relationship.'
                        })
                except Exception as e:
                    print(f"NLI check error: {e}")

            # Circular-definition check (cheap word-overlap heuristic).
            if self._is_circular(claim1, claim2):
                circular_refs.append({
                    'claim1': claim1,
                    'claim2': claim2
                })

        # Score: fraction of pairs with an issue, inverted to 0-100.
        # n choose 2 computed arithmetically instead of materializing the
        # combinations a second time just to count them (was O(n^2) memory).
        n = len(claim_texts)
        total_pairs = n * (n - 1) // 2
        issues = len(contradictions) + len(circular_refs)
        consistency_score = max(0, 100 - (issues / max(total_pairs, 1)) * 100)

        return {
            'consistency_score': consistency_score,
            'contradictions': contradictions[:5],  # Limit to top 5
            'circular_definitions': circular_refs[:3],
            'entailment_failures': []
        }

    def _check_entailment(self, premise: str, hypothesis: str) -> str:
        """Return 'contradiction', 'entailment' or 'neutral' for two statements."""
        if not self.nli_model:
            return 'neutral'

        try:
            # NOTE(review): NLI pipelines normally take a (text, text_pair)
            # input; this single "[SEP]"-joined string may not be tokenized
            # as a sentence pair -- verify against the model's tokenizer.
            result = self.nli_model(f"{premise} [SEP] {hypothesis}")

            label = result[0]['label'].lower()

            if 'contradiction' in label or 'contradict' in label:
                return 'contradiction'
            elif 'entailment' in label or 'entail' in label:
                return 'entailment'
            else:
                return 'neutral'
        except Exception as e:
            print(f"Entailment check error: {e}")
            return 'neutral'

    def _is_circular(self, claim1: str, claim2: str) -> bool:
        """Heuristic: two claims sharing >= 70% of their content words look circular."""
        words1 = set(claim1.lower().split())
        words2 = set(claim2.lower().split())

        # Drop function words so overlap reflects content only.
        stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                     'should', 'may', 'might', 'can', 'to', 'of', 'in', 'for', 'on', 'with'}

        words1 = words1 - stopwords
        words2 = words2 - stopwords

        # Only meaningful when both claims retain a few content words.
        if len(words1) > 2 and len(words2) > 2:
            overlap = len(words1 & words2)
            return overlap >= min(len(words1), len(words2)) * 0.7

        return False
|
analysis/coverage_analyzer.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Coverage Analyzer Module
|
| 3 |
+
Analyzes how well user explanation covers canonical concept graph
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
import numpy as np
|
| 8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
|
| 10 |
+
class CoverageAnalyzer:
    """Scores how well a user's explanation covers a canonical concept graph."""

    def __init__(self):
        # No models to load; this analyzer is pure heuristics.
        self._ready = True

    def is_ready(self) -> bool:
        """Always True: there is nothing to initialize."""
        return self._ready

    async def analyze_coverage(
        self,
        user_claims: List[Dict],
        canonical_graph: Dict,
        explanation: str
    ) -> Dict:
        """Analyze concept coverage by matching user claims to graph nodes.

        Args:
            user_claims: dicts with at least 'text' and 'embedding' keys.
            canonical_graph: {'nodes': [{'id', 'label', optional 'type'}], ...}
            explanation: the full original explanation text.

        Returns:
            {
                'coverage_score': float (0-100),
                'node_coverage': Dict[node_id, status dict],
                'missing_concepts': List[Dict],
                'weak_links': List[Dict],
                'name_dropping': List[str]
            }
        """
        claim_embeddings = [claim['embedding'] for claim in user_claims]
        claim_texts = [claim['text'] for claim in user_claims]

        node_coverage = {}
        missing_concepts = []
        weak_links = []

        for node in canonical_graph['nodes']:
            node_id = node['id']
            node_label = node['label']

            coverage_status = self._check_node_coverage(
                node_label=node_label,
                claim_texts=claim_texts,
                claim_embeddings=claim_embeddings,
                explanation=explanation
            )

            node_coverage[node_id] = coverage_status

            if coverage_status['status'] == 'missing':
                # Missing prerequisites are worse than missing components.
                severity = 'high' if node.get('type') == 'prerequisite' else 'medium'
                missing_concepts.append({
                    'concept': node_label,
                    'severity': severity,
                    'description': f"This is a key {'prerequisite' if node.get('type') == 'prerequisite' else 'component'} for understanding the concept."
                })
            elif coverage_status['status'] == 'weak':
                weak_links.append({
                    'concept': node_label,
                    'user_quote': coverage_status.get('user_quote', ''),
                    'suggestion': 'Explain the mechanism or relationship, not just mention the term.'
                })

        coverage_score = self._calculate_coverage_score(node_coverage, canonical_graph)

        # Detect name-dropping (mentioned but not explained).
        name_dropping = self._detect_name_dropping(claim_texts, node_coverage)

        return {
            'coverage_score': coverage_score,
            'node_coverage': node_coverage,
            'missing_concepts': missing_concepts,
            'weak_links': weak_links,
            'name_dropping': name_dropping
        }

    def _check_node_coverage(
        self,
        node_label: str,
        claim_texts: List[str],
        claim_embeddings: List[List[float]],
        explanation: str
    ) -> Dict:
        """Check if and how well a concept node is covered.

        NOTE(review): claim_embeddings is currently unused (kept for interface
        stability); coverage strength is a word-count heuristic, not semantic
        similarity as the surrounding comments once suggested.
        """
        node_lower = node_label.lower()

        # Not mentioned anywhere in the explanation -> missing outright.
        if node_lower not in explanation.lower():
            return {
                'status': 'missing',
                'user_quote': None,
                'coverage_strength': 0.0
            }

        # Find the best-covering claim that mentions the label.
        # Heuristic: a longer claim suggests an actual explanation
        # (saturating at 15 words).
        best_match_idx = None
        best_score = 0.0
        for idx, claim_text in enumerate(claim_texts):
            if node_lower in claim_text.lower():
                coverage_strength = min(1.0, len(claim_text.split()) / 15.0)
                if coverage_strength > best_score:
                    best_score = coverage_strength
                    best_match_idx = idx

        if best_match_idx is not None:
            # Map strength to a coarse status bucket.
            if best_score > 0.6:
                status = 'covered'
            elif best_score > 0.2:
                status = 'weak'
            else:
                status = 'missing'

            return {
                'status': status,
                'user_quote': claim_texts[best_match_idx],
                'coverage_strength': best_score
            }

        # Mentioned in the explanation but not in any claim (name-dropping).
        return {
            'status': 'weak',
            'user_quote': None,
            'coverage_strength': 0.1
        }

    def _calculate_coverage_score(self, node_coverage: Dict, canonical_graph: Dict) -> float:
        """Weighted coverage: prerequisites count double; weak coverage counts 40%."""
        if not node_coverage:
            return 0.0

        total_weight = 0.0
        covered_weight = 0.0

        for node in canonical_graph['nodes']:
            # Prerequisites are more important than components.
            weight = 2.0 if node.get('type') == 'prerequisite' else 1.0
            total_weight += weight

            status = node_coverage.get(node['id'], {}).get('status', 'missing')
            if status == 'covered':
                covered_weight += weight
            elif status == 'weak':
                covered_weight += weight * 0.4

        return (covered_weight / total_weight * 100) if total_weight > 0 else 0.0

    def _detect_name_dropping(self, claim_texts: List[str], node_coverage: Dict) -> List[str]:
        """Return quotes for concepts that are mentioned but barely explained (top 3)."""
        name_dropped = []

        # Original code tested user_quote twice and iterated .items() while
        # discarding the key; one truthiness check over values() suffices.
        for coverage in node_coverage.values():
            if coverage.get('user_quote') and coverage.get('coverage_strength', 0) < 0.3:
                name_dropped.append(coverage['user_quote'])

        return name_dropped[:3]  # Limit to top 3
|
analysis/graph_generator.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Concept Graph Generator
|
| 3 |
+
Generates canonical concept dependency graphs for given concepts
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
import os
|
| 8 |
+
import requests
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
class ConceptGraphGenerator:
    """Builds canonical prerequisite/component graphs for a given concept."""

    def __init__(self):
        # This generator calls a remote API, so it is ready immediately.
        self._ready = True
        # HF Inference API credentials and target LLM endpoint.
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"

    def is_ready(self) -> bool:
        """Always True: no local model is loaded."""
        return self._ready
|
| 19 |
+
|
| 20 |
+
async def generate_graph(self, concept: str) -> Dict:
    """Generate the canonical concept dependency graph for *concept*.

    Returns:
        {
            'nodes': [{'id': str, 'label': str, 'level': int}],
            'edges': [{'source': str, 'target': str, 'relationship': str}]
        }
    """
    # Ask the LLM for the raw prerequisite/component structure, then
    # validate and normalize it into the node/edge format above.
    raw_structure = await self._llm_generate_structure(concept)
    return self._format_graph(raw_structure, concept)
|
| 35 |
+
|
| 36 |
+
async def _llm_generate_structure(self, concept: str) -> Dict:
    """Use the LLM to propose prerequisites/components/relationships for *concept*.

    Falls back to a static template when the API call fails, returns a
    non-200 status, or the response contains no parseable JSON.
    """
    prompt = f"""<s>[INST] You are a concept structure expert. For the concept "{concept}", identify the core prerequisite concepts that must be understood first, and their relationships.

Output a JSON structure with:
1. "prerequisites": list of prerequisite concepts needed to understand {concept}
2. "core_components": main parts/aspects of {concept} itself
3. "relationships": how concepts connect (prerequisite, causal, etc.)

Be precise and pedagogical. Focus on understanding order.

Output only valid JSON, no other text. [/INST]"""

    try:
        headers = {"Authorization": f"Bearer {self.hf_api_key}"}
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 800,
                "temperature": 0.4,
                "return_full_text": False
            }
        }

        response = requests.post(self.llm_endpoint, headers=headers, json=payload, timeout=30)

        if response.status_code == 200:
            result = response.json()
            text = result[0]['generated_text'] if isinstance(result, list) else result.get('generated_text', '')

            # The model may wrap the JSON in prose; take the outermost braces.
            try:
                start = text.find('{')
                end = text.rfind('}') + 1
                if start != -1 and end > start:
                    return json.loads(text[start:end])
            except json.JSONDecodeError:
                # Was a bare `except:` that swallowed everything (including
                # KeyboardInterrupt); only malformed JSON should fall through
                # to the template fallback.
                pass

            return self._fallback_graph(concept)
        else:
            return self._fallback_graph(concept)

    except Exception as e:
        print(f"Graph generation error: {e}")
        return self._fallback_graph(concept)
|
| 84 |
+
|
| 85 |
+
def _fallback_graph(self, concept: str) -> Dict:
    """Fallback: return a basic, hand-curated graph structure.

    Used when the LLM call fails or returns unparseable output. A few
    well-known concepts have dedicated templates; anything else gets a
    generic one-prerequisite skeleton.
    """
    # Hand-curated structures for common demo concepts.
    templates = {
        'entropy': {
            'prerequisites': ['energy', 'system states', 'probability'],
            'core_components': ['disorder measure', 'thermodynamic entropy', 'information entropy'],
            'relationships': [
                ('energy', 'entropy', 'prerequisite'),
                ('system states', 'entropy', 'prerequisite'),
                ('probability', 'entropy', 'prerequisite'),
            ],
        },
        'neural networks': {
            'prerequisites': ['linear algebra', 'calculus', 'probability'],
            'core_components': ['neurons', 'layers', 'weights', 'activation functions', 'backpropagation'],
            'relationships': [
                ('linear algebra', 'neural networks', 'prerequisite'),
                ('neurons', 'layers', 'component'),
                ('weights', 'neurons', 'component'),
                ('backpropagation', 'weights', 'causal'),
            ],
        },
        'photosynthesis': {
            'prerequisites': ['energy', 'chemical reactions', 'cells'],
            'core_components': ['light reactions', 'dark reactions', 'chlorophyll', 'glucose production'],
            'relationships': [
                ('energy', 'light reactions', 'prerequisite'),
                ('light reactions', 'dark reactions', 'causal'),
                ('dark reactions', 'glucose production', 'causal'),
            ],
        },
    }

    # Substring match so e.g. "Entropy (Physics)" still hits the template.
    needle = concept.lower()
    matched = next((tpl for key, tpl in templates.items() if key in needle), None)
    if matched is not None:
        return matched

    # Generic skeleton for unknown concepts.
    return {
        'prerequisites': ['foundational knowledge'],
        'core_components': [concept, f'{concept} principles', f'{concept} applications'],
        'relationships': [
            ('foundational knowledge', concept, 'prerequisite')
        ],
    }
|
| 133 |
+
|
| 134 |
+
def _format_graph(self, structure: Dict, concept: str) -> Dict:
    """Format a raw graph structure into the frontend node/edge shape.

    Node ids are assigned sequentially (node_0, node_1, ...) in the order:
    prerequisites (level 0), the main concept (level 1), then core
    components (level 2).
    """
    nodes = []
    edges = []
    node_map = {}

    def _add_node(label, level, node_type):
        # Sequential id: one append per id, so len(nodes) is the next index.
        nid = f'node_{len(nodes)}'
        node_map[label] = nid
        nodes.append({
            'id': nid,
            'label': label,
            'level': level,
            'type': node_type,
        })
        return nid

    for prereq in structure.get('prerequisites', []):
        _add_node(prereq, 0, 'prerequisite')

    concept_node_id = _add_node(concept, 1, 'main')

    for component in structure.get('core_components', []):
        _add_node(component, 2, 'component')

    # Translate (source, target, type) relationship triples into edges;
    # unknown labels fall back to the main concept node.
    for rel in structure.get('relationships', []):
        if len(rel) < 3:
            continue
        src, dst, rel_type = rel[0], rel[1], rel[2]
        edges.append({
            'source': node_map.get(src, node_map.get(concept)),
            'target': node_map.get(dst, concept_node_id),
            'relationship': rel_type,
        })

    # If the structure carried no usable relationships, wire every
    # prerequisite straight into the main concept.
    if not edges:
        edges = [
            {
                'source': node_map[prereq],
                'target': concept_node_id,
                'relationship': 'prerequisite',
            }
            for prereq in structure.get('prerequisites', [])
        ]

    return {
        'nodes': nodes,
        'edges': edges,
        'concept': concept,
    }
|
analysis/scorer.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Understanding Scorer Module
|
| 3 |
+
Calculates final understanding scores from analysis results
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Optional
|
| 7 |
+
|
| 8 |
+
class UnderstandingScorer:
    """Combine per-dimension analysis results into understanding scores.

    The overall score is a weighted average of four dimensions:
    consistency, coverage, stability, and assumption completeness.
    """

    def __init__(self):
        # Relative weight of each dimension in the overall score (sums to 1.0).
        self.weights = {
            'consistency': 0.25,
            'coverage': 0.35,
            'stability': 0.25,
            'assumptions': 0.15
        }

    def calculate_scores(
        self,
        consistency_result: Dict,
        coverage_result: Dict,
        stability_result: Optional[Dict]
    ) -> Dict:
        """Calculate multi-dimensional understanding scores.

        Returns a dict with 'overall', 'consistency', 'coverage',
        'stability' and 'assumptions' keys, each a float in [0, 100]
        rounded to one decimal place.
        """
        dims = {
            'consistency': consistency_result.get('consistency_score', 0),
            'coverage': coverage_result.get('coverage_score', 0),
            # Stability testing is optional; an absent result counts as stable.
            'stability': stability_result.get('stability_score', 100) if stability_result else 100,
        }
        dims['assumptions'] = self._calculate_assumption_score(
            coverage_result=coverage_result,
            consistency_result=consistency_result,
        )

        overall = sum(dims[name] * weight for name, weight in self.weights.items())

        return {
            'overall': round(overall, 1),
            'consistency': round(dims['consistency'], 1),
            'coverage': round(dims['coverage'], 1),
            'stability': round(dims['stability'], 1),
            'assumptions': round(dims['assumptions'], 1),
        }

    def _calculate_assumption_score(
        self,
        coverage_result: Dict,
        consistency_result: Dict
    ) -> float:
        """Estimate how well implicit assumptions are handled.

        Heuristic: start from the mean of consistency and coverage, then
        penalize missing concepts (unstated prerequisites, up to -30) and
        contradictions (conflicting implicit assumptions, up to -20).
        Result is clamped to [0, 100].
        """
        consistency = consistency_result.get('consistency_score', 0)
        coverage = coverage_result.get('coverage_score', 0)

        missing_count = len(coverage_result.get('missing_concepts', []))
        contradiction_count = len(consistency_result.get('contradictions', []))

        score = (consistency + coverage) / 2
        score -= min(30, missing_count * 10)
        score -= min(20, contradiction_count * 15)

        return max(0, min(100, score))
|
analysis/stability_tester.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stability Tester Module
|
| 3 |
+
Tests if understanding holds under reformulation and stress testing
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict, Optional
|
| 7 |
+
import os
|
| 8 |
+
import requests
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sentence_transformers import SentenceTransformer
|
| 11 |
+
|
| 12 |
+
class StabilityTester:
    """Tests whether an explanation of a concept holds up under reformulation.

    The current implementation judges each claim with lightweight lexical
    heuristics (brevity, vague wording, unresolved references). The embedding
    model and LLM endpoint are initialized for future semantic-drift
    measurement against real reformulations, but are not consulted yet.
    """

    def __init__(self):
        # Reserved for semantic-drift comparison of reformulations (not yet used
        # per-request; loading here keeps the cost at startup).
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.hf_api_key = os.getenv('HUGGINGFACE_API_KEY')
        self.llm_endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        self._ready = True

    def is_ready(self) -> bool:
        """Report whether the tester is initialized."""
        return self._ready

    async def test_stability(
        self,
        concept: str,
        original_explanation: str,
        claims: List[Dict]
    ) -> Dict:
        """Test explanation stability through reformulation.

        Strategy:
        1. Generate stress-test re-prompts for the concept.
        2. Score each extracted claim for stability via heuristics.
        3. Aggregate per-claim drift into an overall stability score.

        Returns:
            {
                'stability_score': float (0-100, higher = more stable),
                'drift_scores': Dict[claim_id, float],
                'unstable_claims': List[Dict] (at most 3),
                'stress_test_results': List[Dict] (prompt + pass flag)
            }

        Note: ``original_explanation`` is kept in the signature for the
        future drift comparison; a previous version eagerly embedded it even
        though the embedding was never read, which wasted a full model
        inference per request. That dead call has been removed.
        """
        stress_prompts = self._generate_stress_prompts(concept)

        unstable_claims = []
        claim_drift_scores = {}

        for claim in claims:
            stability = await self._test_claim_stability(
                claim=claim,
                concept=concept,
                all_claims=claims
            )

            claim_drift_scores[claim['id']] = stability['drift_score']

            if stability['is_unstable']:
                unstable_claims.append({
                    'claim': claim['text'],
                    'reason': stability['reason'],
                    'drift_score': stability['drift_score']
                })

        # Overall stability: 100 minus mean drift (drift is on a 0-1 scale).
        avg_drift = np.mean(list(claim_drift_scores.values())) if claim_drift_scores else 0.0
        stability_score = max(0, 100 - (avg_drift * 100))

        return {
            'stability_score': stability_score,
            'drift_scores': claim_drift_scores,
            'unstable_claims': unstable_claims[:3],  # report at most the top 3
            'stress_test_results': [
                {
                    'prompt': prompt,
                    'passes': len(unstable_claims) == 0
                }
                for prompt in stress_prompts[:2]
            ]
        }

    def _generate_stress_prompts(self, concept: str) -> List[str]:
        """Generate reformulation prompts used as stress tests."""
        return [
            f"Explain {concept} in a different way",
            f"What would happen if {concept} didn't exist?",
            f"Explain {concept} to a 10-year-old",
            f"What are the limits or boundary conditions of {concept}?"
        ]

    async def _test_claim_stability(
        self,
        claim: Dict,
        concept: str,
        all_claims: List[Dict]
    ) -> Dict:
        """Heuristically judge whether a single claim is stable.

        Returns a dict with 'is_unstable' (bool), 'reason' (str) and
        'drift_score' (float, 0-1; higher = less stable).
        """
        claim_text = claim['text']
        word_count = len(claim_text.split())

        # Very short claims (<5 words) rarely demonstrate real understanding.
        if word_count < 5:
            return {
                'is_unstable': True,
                'reason': 'Claim is too brief to demonstrate understanding',
                'drift_score': 0.6
            }

        # Two or more vague filler terms suggest surface-level understanding.
        vague_terms = ['thing', 'stuff', 'kind of', 'sort of', 'basically', 'just', 'simply']
        vague_count = sum(1 for term in vague_terms if term in claim_text.lower())

        if vague_count >= 2:
            return {
                'is_unstable': True,
                'reason': 'Contains vague language suggesting surface understanding',
                'drift_score': 0.5
            }

        # Claims opening with a pronoun likely depend on other claims for
        # their antecedent and do not stand alone.
        unclear_refs = ['this', 'that', 'it', 'these', 'those']
        has_unclear_ref = any(claim_text.lower().startswith(ref + ' ') for ref in unclear_refs)

        if has_unclear_ref and len(all_claims) > 1:
            return {
                'is_unstable': True,
                'reason': 'Claim has unclear references and may not stand alone',
                'drift_score': 0.4
            }

        return {
            'is_unstable': False,
            'reason': 'Claim appears well-formed',
            'drift_score': 0.1
        }
|
main.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RealityCheck AI - Backend API
|
| 3 |
+
FastAPI server for analyzing how well someone understands a concept
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, HTTPException
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
import os
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
from analysis.claim_extractor import ClaimExtractor
|
| 14 |
+
from analysis.graph_generator import ConceptGraphGenerator
|
| 15 |
+
from analysis.consistency_checker import ConsistencyChecker
|
| 16 |
+
from analysis.coverage_analyzer import CoverageAnalyzer
|
| 17 |
+
from analysis.stability_tester import StabilityTester
|
| 18 |
+
from analysis.scorer import UnderstandingScorer
|
| 19 |
+
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
+
# FastAPI application instance; metadata shows up in the generated OpenAPI docs.
app = FastAPI(
    title="RealityCheck AI API",
    description="Understanding analysis engine",
    version="1.0.0"
)

# CORS - TODO: lock this down for production
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True grants
# credentialed access from any origin — restrict origins before any real
# deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], # TODO: change this before deploying
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize analysis pipeline components.
# Constructed once at import time, so any model loading done by these
# constructors happens at startup rather than per request.
claim_extractor = ClaimExtractor()
graph_generator = ConceptGraphGenerator()
consistency_checker = ConsistencyChecker()
coverage_analyzer = CoverageAnalyzer()
stability_tester = StabilityTester()
scorer = UnderstandingScorer()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class AnalysisRequest(BaseModel):
    """Request body for POST /analyze."""
    concept: str  # the concept the user claims to understand
    explanation: str  # the user's free-text explanation of the concept
    test_stability: Optional[bool] = True  # run the optional stability analysis step
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ConceptNode(BaseModel):
    """A node in the concept graph returned to the frontend."""
    id: str
    label: str
    status: str  # 'covered', 'weak', 'missing'
    user_quote: Optional[str] = None  # supporting quote from the user's explanation, when available
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class ConceptEdge(BaseModel):
    """A directed edge between two concept-graph nodes."""
    source: str  # id of the source node
    target: str  # id of the target node
    relationship: str  # 'prerequisite', 'causal', 'related'
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class ConceptGraph(BaseModel):
    """Concept graph (nodes + edges) annotated with user coverage."""
    nodes: List[ConceptNode]
    edges: List[ConceptEdge]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class ScoreBreakdown(BaseModel):
    """Per-dimension understanding scores, each on a 0-100 scale."""
    consistency: float
    coverage: float
    stability: float
    assumption_completeness: float
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class FeedbackItem(BaseModel):
    """A single piece of targeted feedback about the explanation."""
    type: str  # 'missing_concept', 'contradiction', 'weak_link'
    severity: str  # 'high', 'medium', 'low'
    message: str  # what was detected
    suggestion: str  # how the user might improve
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class AnalysisResponse(BaseModel):
    """Full result payload returned by POST /analyze."""
    overall_score: float  # weighted 0-100 understanding score
    score_breakdown: ScoreBreakdown
    concept_graph: ConceptGraph
    feedback: List[FeedbackItem]
    confidence_mismatch_warning: Optional[str] = None  # set when confident tone meets a low score
    explanation_stability: Optional[Dict[str, float]] = None  # per-claim drift scores, if stability was tested
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.get("/")
async def root():
    """Basic health-check endpoint reporting service identity and status."""
    return {
        "message": "RealityCheck AI API",
        "status": "operational",
        "version": "1.0.0",
    }
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@app.get("/health")
async def health_check():
    """Detailed health check reporting readiness of each model-backed component."""
    model_status = {
        "embeddings": claim_extractor.is_ready(),
        "nli": consistency_checker.is_ready(),
        "llm": graph_generator.is_ready(),
    }
    return {"status": "healthy", "models_loaded": model_status}
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_understanding(request: AnalysisRequest):
    """
    Main endpoint: Analyze user's conceptual understanding

    This endpoint orchestrates the entire analysis pipeline:
    1. Extract claims from explanation
    2. Generate canonical concept graph
    3. Check logical consistency
    4. Analyze concept coverage
    5. Test explanation stability
    6. Calculate understanding scores

    Steps are order-dependent: claims feed consistency, coverage and
    stability; the canonical graph feeds coverage and the response graph.
    """
    try:
        # Step 1: Extract atomic claims from user explanation
        claims = await claim_extractor.extract_claims(request.explanation)

        # Step 2: Generate canonical concept graph for the concept
        canonical_graph = await graph_generator.generate_graph(request.concept)

        # Step 3: Check logical consistency between claims
        consistency_result = await consistency_checker.check_consistency(claims)

        # Step 4: Analyze concept coverage
        coverage_result = await coverage_analyzer.analyze_coverage(
            user_claims=claims,
            canonical_graph=canonical_graph,
            explanation=request.explanation
        )

        # Step 5: Test stability (if requested; skipping it defaults the
        # stability score to 100 in the scorer)
        stability_result = None
        if request.test_stability:
            stability_result = await stability_tester.test_stability(
                concept=request.concept,
                original_explanation=request.explanation,
                claims=claims
            )

        # Step 6: Calculate overall understanding score
        scores = scorer.calculate_scores(
            consistency_result=consistency_result,
            coverage_result=coverage_result,
            stability_result=stability_result
        )

        # Build concept graph with user coverage
        concept_graph = _build_concept_graph(
            canonical_graph=canonical_graph,
            coverage_result=coverage_result
        )

        # Generate targeted feedback
        feedback = _generate_feedback(
            consistency_result=consistency_result,
            coverage_result=coverage_result,
            stability_result=stability_result
        )

        # Detect confidence-understanding mismatch
        confidence_warning = _check_confidence_mismatch(
            explanation=request.explanation,
            overall_score=scores['overall']
        )

        return AnalysisResponse(
            overall_score=scores['overall'],
            score_breakdown=ScoreBreakdown(
                consistency=scores['consistency'],
                coverage=scores['coverage'],
                stability=scores['stability'],
                assumption_completeness=scores['assumptions']
            ),
            concept_graph=concept_graph,
            feedback=feedback,
            confidence_mismatch_warning=confidence_warning,
            explanation_stability=stability_result.get('drift_scores') if stability_result else None
        )

    # NOTE(review): this broad catch converts every failure (including bad
    # input from downstream components) into a 500 and leaks the raw
    # exception text to the client — consider narrowing and logging instead.
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
@app.get("/concepts")
async def get_sample_concepts():
    """Return a static list of sample concepts for testing the analyzer."""
    samples = [
        ("Entropy (Physics)", "Physics", "intermediate"),
        ("Neural Networks", "Computer Science", "intermediate"),
        ("Photosynthesis", "Biology", "beginner"),
        ("Supply and Demand", "Economics", "beginner"),
        ("Recursion", "Computer Science", "intermediate"),
        ("Natural Selection", "Biology", "intermediate"),
    ]
    return {
        "concepts": [
            {"name": name, "category": category, "difficulty": difficulty}
            for name, category, difficulty in samples
        ]
    }
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def _build_concept_graph(canonical_graph: Dict, coverage_result: Dict) -> ConceptGraph:
    """Merge coverage analysis into the canonical graph for the response.

    Nodes absent from the coverage map are marked 'missing'.
    """
    node_coverage = coverage_result.get('node_coverage', {})

    nodes = []
    for node in canonical_graph['nodes']:
        info = node_coverage.get(node['id'], {})
        nodes.append(ConceptNode(
            id=node['id'],
            label=node['label'],
            status=info.get('status', 'missing'),
            user_quote=info.get('user_quote'),
        ))

    edges = []
    for edge in canonical_graph['edges']:
        edges.append(ConceptEdge(
            source=edge['source'],
            target=edge['target'],
            relationship=edge['relationship'],
        ))

    return ConceptGraph(nodes=nodes, edges=edges)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _generate_feedback(
    consistency_result: Dict,
    coverage_result: Dict,
    stability_result: Optional[Dict]
) -> List[FeedbackItem]:
    """Generate targeted feedback items, ordered by category:
    contradictions, missing concepts, weak links, then instabilities.
    """
    feedback: List[FeedbackItem] = []

    # Logical contradictions between extracted claims.
    feedback.extend(
        FeedbackItem(
            type='contradiction',
            severity='high',
            message=f"Contradiction detected between: '{c['claim1']}' and '{c['claim2']}'",
            suggestion=c.get('suggestion', 'Review these claims for logical consistency'),
        )
        for c in consistency_result.get('contradictions', [])
    )

    # Prerequisite concepts the explanation never touched.
    feedback.extend(
        FeedbackItem(
            type='missing_concept',
            severity=m.get('severity', 'medium'),
            message=f"Missing prerequisite concept: {m['concept']}",
            suggestion=f"Consider explaining: {m.get('description', '')}",
        )
        for m in coverage_result.get('missing_concepts', [])
    )

    # Concepts that were mentioned but only weakly explained.
    feedback.extend(
        FeedbackItem(
            type='weak_link',
            severity='low',
            message=f"Weak explanation of: {w['concept']}",
            suggestion=w.get('suggestion', 'Provide more detail'),
        )
        for w in coverage_result.get('weak_links', [])
    )

    # Claims that did not survive the stability stress tests.
    if stability_result:
        feedback.extend(
            FeedbackItem(
                type='instability',
                severity='medium',
                message=f"Explanation becomes unclear when reformulated: {u['claim']}",
                suggestion="This may indicate surface-level understanding. Try explaining the underlying mechanism.",
            )
            for u in stability_result.get('unstable_claims', [])
        )

    return feedback
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _check_confidence_mismatch(explanation: str, overall_score: float) -> Optional[str]:
    """Detect when an explanation sounds confident but scores low.

    Heuristic: if at least two confident-language markers appear and the
    overall score is below 60, return a warning string; otherwise None.
    """
    confident_markers = (
        'obviously', 'clearly', 'of course', 'everyone knows',
        'it is evident', 'undoubtedly', 'certainly', 'definitely',
    )

    lowered = explanation.lower()
    marker_hits = sum(marker in lowered for marker in confident_markers)

    if marker_hits < 2 or overall_score >= 60:
        return None

    return (
        "⚠️ Confidence-Understanding Mismatch Detected: "
        "Your explanation uses confident language, but analysis suggests potential gaps. "
        "This is common when we're familiar with terminology but haven't fully internalized the concepts."
    )
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
if __name__ == "__main__":
    # Local development entry point; binds on all interfaces at port 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.100.0
|
| 2 |
+
uvicorn[standard]>=0.25.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
python-multipart>=0.0.6
|
| 5 |
+
sentence-transformers>=2.0.0
|
| 6 |
+
transformers>=4.30.0
|
| 7 |
+
torch>=2.0.0
|
| 8 |
+
numpy>=1.24.0
|
| 9 |
+
networkx>=3.0.0
|
| 10 |
+
python-dotenv>=1.0.0
|
| 11 |
+
huggingface-hub>=0.16.0
|
| 12 |
+
requests>=2.31.0
|
| 13 |
+
scikit-learn>=1.3.0
|