Upload 21 files
Browse files- .gitignore +122 -0
- README.md +148 -20
- app.py +28 -0
- config.py +12 -0
- mechanisms/__init__.py +15 -0
- mechanisms/__pycache__/__init__.cpython-311.pyc +0 -0
- mechanisms/__pycache__/baseline.cpython-311.pyc +0 -0
- mechanisms/__pycache__/caching.cpython-311.pyc +0 -0
- mechanisms/__pycache__/consensus.cpython-311.pyc +0 -0
- mechanisms/__pycache__/constraint.cpython-311.pyc +0 -0
- mechanisms/__pycache__/historical.cpython-311.pyc +0 -0
- mechanisms/__pycache__/predictability.cpython-311.pyc +0 -0
- mechanisms/baseline.py +4 -0
- mechanisms/caching.py +11 -0
- mechanisms/consensus.py +33 -0
- mechanisms/constraint.py +21 -0
- mechanisms/historical.py +16 -0
- mechanisms/predictability.py +16 -0
- requirements.txt +5 -3
- utils/__pycache__/llm_utils.cpython-311.pyc +0 -0
- utils/llm_utils.py +12 -0
.gitignore
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ------------------------------
|
| 2 |
+
# Python
|
| 3 |
+
# ------------------------------
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*$py.class
|
| 7 |
+
|
| 8 |
+
# C extensions
|
| 9 |
+
*.so
|
| 10 |
+
*.pyd
|
| 11 |
+
*.dll
|
| 12 |
+
|
| 13 |
+
# ------------------------------
|
| 14 |
+
# Environments
|
| 15 |
+
# ------------------------------
|
| 16 |
+
.venv/
|
| 17 |
+
venv/
|
| 18 |
+
env/
|
| 19 |
+
ENV/
|
| 20 |
+
.venv*/
|
| 21 |
+
venv*/
|
| 22 |
+
env*/
|
| 23 |
+
ENV*/
|
| 24 |
+
.python-version
|
| 25 |
+
|
| 26 |
+
# ------------------------------
|
| 27 |
+
# Distribution / packaging
|
| 28 |
+
# ------------------------------
|
| 29 |
+
.Python
|
| 30 |
+
build/
|
| 31 |
+
dist/
|
| 32 |
+
downloads/
|
| 33 |
+
eggs/
|
| 34 |
+
.eggs/
|
| 35 |
+
sdist/
|
| 36 |
+
wheels/
|
| 37 |
+
share/python-wheels/
|
| 38 |
+
*.egg-info/
|
| 39 |
+
.installed.cfg
|
| 40 |
+
*.egg
|
| 41 |
+
MANIFEST
|
| 42 |
+
pip-wheel-metadata/
|
| 43 |
+
pip-log.txt
|
| 44 |
+
pip-delete-this-directory.txt
|
| 45 |
+
|
| 46 |
+
# ------------------------------
|
| 47 |
+
# Unit test / coverage reports
|
| 48 |
+
# ------------------------------
|
| 49 |
+
htmlcov/
|
| 50 |
+
.tox/
|
| 51 |
+
.nox/
|
| 52 |
+
.coverage
|
| 53 |
+
.coverage.*
|
| 54 |
+
.cache
|
| 55 |
+
nosetests.xml
|
| 56 |
+
coverage.xml
|
| 57 |
+
*.cover
|
| 58 |
+
*.py,cover
|
| 59 |
+
.pytest_cache/
|
| 60 |
+
junit*.xml
|
| 61 |
+
|
| 62 |
+
# ------------------------------
|
| 63 |
+
# Type checkers / linters
|
| 64 |
+
# ------------------------------
|
| 65 |
+
.mypy_cache/
|
| 66 |
+
.dmypy.json
|
| 67 |
+
dmypy.json
|
| 68 |
+
.pyre/
|
| 69 |
+
.pytype/
|
| 70 |
+
.ruff_cache/
|
| 71 |
+
|
| 72 |
+
# ------------------------------
|
| 73 |
+
# PyInstaller
|
| 74 |
+
# ------------------------------
|
| 75 |
+
*.manifest
|
| 76 |
+
*.spec
|
| 77 |
+
|
| 78 |
+
# ------------------------------
|
| 79 |
+
# Jupyter
|
| 80 |
+
# ------------------------------
|
| 81 |
+
.ipynb_checkpoints/
|
| 82 |
+
|
| 83 |
+
# ------------------------------
|
| 84 |
+
# Logs and runtime files
|
| 85 |
+
# ------------------------------
|
| 86 |
+
logs/
|
| 87 |
+
*.log
|
| 88 |
+
*.pid
|
| 89 |
+
*.pid.lock
|
| 90 |
+
|
| 91 |
+
# ------------------------------
|
| 92 |
+
# Local environment variables & secrets
|
| 93 |
+
# ------------------------------
|
| 94 |
+
.env
|
| 95 |
+
.env.*
|
| 96 |
+
!.env.example
|
| 97 |
+
|
| 98 |
+
# ------------------------------
|
| 99 |
+
# Editors / IDEs / Tooling
|
| 100 |
+
# ------------------------------
|
| 101 |
+
.idea/
|
| 102 |
+
*.iml
|
| 103 |
+
.vscode/
|
| 104 |
+
.history/
|
| 105 |
+
.cursor/
|
| 106 |
+
*.code-workspace
|
| 107 |
+
|
| 108 |
+
# ------------------------------
|
| 109 |
+
# OS-specific
|
| 110 |
+
# ------------------------------
|
| 111 |
+
.DS_Store
|
| 112 |
+
Thumbs.db
|
| 113 |
+
ehthumbs.db
|
| 114 |
+
Desktop.ini
|
| 115 |
+
|
| 116 |
+
# ------------------------------
|
| 117 |
+
# Optional local data & temp
|
| 118 |
+
# ------------------------------
|
| 119 |
+
tmp/
|
| 120 |
+
temp/
|
| 121 |
+
data/
|
| 122 |
+
|
README.md
CHANGED
|
@@ -1,20 +1,148 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
---
|
| 14 |
-
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## StableAI – LLM Consistency Demo
|
| 2 |
+
|
| 3 |
+
A small Streamlit app that showcases several practical mechanisms to improve LLM predictability and consistency. It runs locally and calls Groq-hosted models via an OpenAI-compatible API.
|
| 4 |
+
|
| 5 |
+
### What this app demonstrates
|
| 6 |
+
- **Baseline (Raw LLM)**: Direct model call without safeguards.
|
| 7 |
+
- **Caching & Replay**: Deterministic replay for identical prompts in a session.
|
| 8 |
+
- **Historical Consistency**: Reuses prior answers for similar prompts using a fuzzy matcher.
|
| 9 |
+
- **Cross-Model Consensus**: Gathers answers from multiple models and asks a judge model to summarize consensus.
|
| 10 |
+
- **Constraint Validation (Schema)**: Forces JSON output and validates it via Pydantic.
|
| 11 |
+
- **Predictability Index**: Runs the same prompt multiple times and scores similarity between outputs.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## Architecture at a glance
|
| 16 |
+
- `app.py`: Streamlit UI; wires user input to registered mechanisms, exposes buttons for cache/history management.
|
| 17 |
+
- `config.py`: Loads environment, initializes OpenAI-compatible client pointing at Groq (`GROQ_API_KEY`, base URL `https://api.groq.com/openai/v1`).
|
| 18 |
+
- `utils/llm_utils.py`: Thin wrapper `call_model(prompt, model)` and `get_hash(text)` for caching keys.
|
| 19 |
+
- `mechanisms/` (registry in `__init__.py`):
|
| 20 |
+
- `baseline.py`: Simple call to `llama-3.1-8b-instant`.
|
| 21 |
+
- `caching.py`: In-memory cache in `st.session_state.cache` keyed by SHA-256 of the prompt.
|
| 22 |
+
- `historical.py`: Similarity lookup over `st.session_state.history` using `difflib.SequenceMatcher`.
|
| 23 |
+
- `consensus.py`: Calls multiple models, then a judge model to assess/summarize consensus.
|
| 24 |
+
- `constraint.py`: Prompts for strict JSON, validates with a Pydantic model.
|
| 25 |
+
- `predictability.py`: N repeated calls; pairwise similarity to compute a predictability score.
|
| 26 |
+
|
| 27 |
+
Notes:
|
| 28 |
+
- Session state is ephemeral (cleared when the Streamlit session resets).
|
| 29 |
+
- Network calls go through the Groq API using the OpenAI SDK interface.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Prerequisites
|
| 34 |
+
- Python 3.9+
|
| 35 |
+
- A Groq API key (`GROQ_API_KEY`)
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## Local setup
|
| 40 |
+
1) Clone and enter the project directory.
|
| 41 |
+
|
| 42 |
+
2) (Recommended) Create and activate a virtual environment:
|
| 43 |
+
```bash
|
| 44 |
+
python -m venv .venv
|
| 45 |
+
# Windows PowerShell
|
| 46 |
+
. .venv\Scripts\Activate.ps1
|
| 47 |
+
# macOS/Linux
|
| 48 |
+
source .venv/bin/activate
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
3) Install dependencies from the included `requirements.txt`:
```bash
pip install -r requirements.txt
```
|
| 55 |
+
|
| 56 |
+
4) Create a `.env` file in the project root with your key:
|
| 57 |
+
```bash
|
| 58 |
+
GROQ_API_KEY=your_groq_api_key_here
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## Running the app
|
| 64 |
+
From the project root:
|
| 65 |
+
```bash
|
| 66 |
+
streamlit run app.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
Then open the URL Streamlit prints (typically `http://localhost:8501`).
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
## Using the app
|
| 74 |
+
1) Choose a mechanism from the radio options.
|
| 75 |
+
2) Enter your query.
|
| 76 |
+
3) Click “Ask”.
|
| 77 |
+
4) Use “Clear Cache” / “Clear History” as needed to reset state.
|
| 78 |
+
|
| 79 |
+
Model notes:
|
| 80 |
+
- Default model for most calls is `llama-3.1-8b-instant`.
|
| 81 |
+
- Cross-model consensus uses `openai/gpt-oss-20b` and `llama-3.3-70b-versatile` and judges with `llama-3.1-8b-instant`.
|
| 82 |
+
|
| 83 |
+
---
|
| 84 |
+
|
| 85 |
+
## How it works (brief)
|
| 86 |
+
- The UI passes the prompt to a selected mechanism via a registry (`MECHANISMS`).
|
| 87 |
+
- Each mechanism composes a request using `utils/llm_utils.call_model` (OpenAI SDK → Groq endpoint).
|
| 88 |
+
- Some mechanisms store/retrieve answers from `st.session_state` for caching and history.
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Limitations and improvement ideas
|
| 93 |
+
|
| 94 |
+
### Similarity and retrieval
|
| 95 |
+
- Replace `difflib.SequenceMatcher` with **embeddings + cosine similarity**:
|
| 96 |
+
- Use sentence embeddings (e.g., `text-embedding-3-large`, or any Groq-supported embedding model) to encode prompts/answers.
|
| 97 |
+
- Compute cosine similarity for robust semantic matching over historical prompts, not just character overlap.
|
| 98 |
+
- Persist vectors and metadata in a vector store (e.g., FAISS, Chroma, pgvector) for efficient nearest-neighbor search.
|
| 99 |
+
- Benefit: better recall/precision for paraphrases and longer contexts.
|
| 100 |
+
|
| 101 |
+
### Caching and persistence
|
| 102 |
+
- Move from in-memory `st.session_state` to a persistent cache (Redis, SQLite) with TTLs and size limits.
|
| 103 |
+
- Cache by normalized prompt + key parameters (model, temperature, system prompt) to avoid accidental collisions.
|
| 104 |
+
- Add cache warming and background refresh for hot prompts.
|
| 105 |
+
|
| 106 |
+
### Determinism and variance control
|
| 107 |
+
- Expose decoding params (temperature, top_p, seed if supported) in the UI.
|
| 108 |
+
- For predictability scoring, fix seeds where supported to separate model stochasticity from service variance.
|
| 109 |
+
- Compute additional stability metrics (e.g., ROUGE-L, BERTScore) between runs.
|
| 110 |
+
|
| 111 |
+
### Robust output contracts
|
| 112 |
+
- Expand `constraint.py` to support multiple schemas and strict parsing with function calling / JSON mode if available.
|
| 113 |
+
- Add retry/repair loop when JSON validation fails (ask the model to fix output).
|
| 114 |
+
|
| 115 |
+
### Evaluation and CI
|
| 116 |
+
- Add an evaluation harness with a small prompt-answer dataset:
|
| 117 |
+
- Track accuracy/consistency per mechanism over time.
|
| 118 |
+
- Save run artifacts (inputs, outputs, scores) for regression checks.
|
| 119 |
+
- Provide unit tests for key utilities and mechanisms; mock network calls.
|
| 120 |
+
|
| 121 |
+
### Observability
|
| 122 |
+
- Add logging/telemetry for latency, token usage, and error rates.
|
| 123 |
+
- Surface metrics in the UI (per mechanism) to understand trade-offs.
|
| 124 |
+
|
| 125 |
+
### UX improvements
|
| 126 |
+
- Show per-mechanism explanations next to results.
|
| 127 |
+
- Allow exporting session cache/history and reloading it.
|
| 128 |
+
- Provide advanced settings accordion (models, decoding params, thresholds).
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
## Example: swapping fuzzy matcher for embeddings
|
| 133 |
+
High-level steps to upgrade `historical.py`:
|
| 134 |
+
1) Add an embedding helper (e.g., `get_embedding(text) -> List[float]`).
|
| 135 |
+
2) On first-seen prompts, store `{prompt, answer, prompt_embedding}` in a persistent store.
|
| 136 |
+
3) On new queries, compute its embedding and run top-k nearest neighbor search by cosine similarity.
|
| 137 |
+
4) If similarity > threshold (e.g., 0.85), return the historical answer; otherwise call the model and insert the new row.
|
| 138 |
+
|
| 139 |
+
This yields more robust reuse across paraphrases and longer prompts, compared to `difflib`.
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## Troubleshooting
|
| 144 |
+
- 401/403 errors: verify `GROQ_API_KEY` and `.env` loading; confirm base URL matches Groq’s OpenAI-compatible endpoint.
|
| 145 |
+
- Streamlit can reuse state across reruns; use the provided buttons to clear cache/history.
|
| 146 |
+
- If models change or rate limits apply, consensus may show partial errors; the UI surfaces them inline.
|
| 147 |
+
|
| 148 |
+
|
app.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
from mechanisms import MECHANISMS

# Per-session stores used by the caching / historical-consistency mechanisms.
# st.session_state survives reruns within one browser session only.
if "cache" not in st.session_state:
    st.session_state.cache = {}
if "history" not in st.session_state:
    st.session_state.history = {}

st.title("LLM Consistency Demo")
st.markdown("Explore mechanisms to improve LLM predictability & consistency.")

# The radio labels double as keys into the mechanism registry.
mode = st.radio("Choose Mechanism:", list(MECHANISMS.keys()))
user_prompt = st.text_input("Enter your query:")

if st.button("Ask"):
    if user_prompt.strip():
        answer = MECHANISMS[mode](user_prompt)
        st.markdown("### Response:")
        st.write(answer)
    else:
        # Previously an empty prompt was a silent no-op; give explicit feedback.
        st.warning("Please enter a query before asking.")

if st.button("Clear Cache"):
    st.session_state.cache.clear()
    st.success("Cache cleared!")

if st.button("Clear History"):
    st.session_state.history.clear()
    st.success("History cleared!")
|
config.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file; override=True lets .env values take
# precedence over anything already set in the process environment.
load_dotenv(override=True)

# Groq API key; may be None here — the OpenAI client does not validate the key
# at construction time, so a missing/invalid key surfaces as a 401/403 on the
# first request instead of at import.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Shared OpenAI-SDK client pointed at Groq's OpenAI-compatible endpoint;
# imported by utils.llm_utils for all model calls.
client = OpenAI(
    api_key=GROQ_API_KEY,
    base_url="https://api.groq.com/openai/v1"
)
|
mechanisms/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from .baseline import baseline
from .caching import caching
from .historical import historical
from .consensus import cross_model
from .constraint import constraint
from .predictability import predictability

# Registry mapping the UI label (shown in the Streamlit radio widget) to the
# mechanism callable. Each callable takes the user prompt string and returns a
# display string for the UI.
MECHANISMS = {
    "Baseline (Raw LLM)": baseline,
    "Caching & Replay": caching,
    "Historical Consistency": historical,
    "Cross-Model Consensus": cross_model,
    "Constraint Validation (Schema)": constraint,
    "Predictability Index": predictability,
}
|
mechanisms/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (656 Bytes). View file
|
|
|
mechanisms/__pycache__/baseline.cpython-311.pyc
ADDED
|
Binary file (498 Bytes). View file
|
|
|
mechanisms/__pycache__/caching.cpython-311.pyc
ADDED
|
Binary file (835 Bytes). View file
|
|
|
mechanisms/__pycache__/consensus.cpython-311.pyc
ADDED
|
Binary file (2.23 kB). View file
|
|
|
mechanisms/__pycache__/constraint.cpython-311.pyc
ADDED
|
Binary file (1.49 kB). View file
|
|
|
mechanisms/__pycache__/historical.cpython-311.pyc
ADDED
|
Binary file (1.2 kB). View file
|
|
|
mechanisms/__pycache__/predictability.cpython-311.pyc
ADDED
|
Binary file (1.93 kB). View file
|
|
|
mechanisms/baseline.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from utils.llm_utils import call_model

def baseline(prompt: str, model: str = "llama-3.1-8b-instant") -> str:
    """Call the model directly with no safeguards and tag the reply as fresh.

    Args:
        prompt: User query forwarded verbatim to the model.
        model: Model identifier; defaults to the app-wide default model.
               (Generalized from a hard-coded constant; default preserves the
               original behavior for existing callers.)

    Returns:
        The model's answer with a "fresh answer" marker appended.
    """
    return call_model(prompt, model) + " \n⚡ (fresh answer)"
|
mechanisms/caching.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from utils.llm_utils import call_model, get_hash
import streamlit as st

def caching(prompt: str) -> str:
    """Replay a cached answer for an identical prompt, querying otherwise.

    Answers live in ``st.session_state.cache`` keyed by the SHA-256 of the
    exact prompt text, so identical prompts replay deterministically within
    a session.
    """
    cache_key = get_hash(prompt)
    cached_answer = st.session_state.cache.get(cache_key)
    if cached_answer is not None:
        return cached_answer + " \n✅ (from cache)"
    fresh_answer = call_model(prompt)
    st.session_state.cache[cache_key] = fresh_answer
    return fresh_answer + " \n💡 (new answer cached)"
|
mechanisms/consensus.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from utils.llm_utils import call_model

def judge_consensus(prompt: str, responses: dict, judge_model="llama-3.1-8b-instant") -> str:
    """Ask a judge model whether the per-model answers agree, and summarize.

    Args:
        prompt: The original user question.
        responses: Mapping of model name -> that model's answer text.
        judge_model: Model used to adjudicate.

    Returns:
        The judge model's verdict, expected in "Consensus: ..." form.
    """
    # Pre-build the bulleted listing; "\n" is the same character the original
    # produced via chr(10).
    model_listing = "\n".join(f"- {m}: {ans}" for m, ans in responses.items())
    judge_prompt = f"""
You are a judge LLM. The user asked: "{prompt}"

Here are the responses from different models:
{model_listing}

Task:
1. Decide if the answers are essentially saying the same thing.
2. If yes, summarize the consensus in 2-3 lines.
3. If no, state clearly that there is no consensus and why.

Answer format:
Consensus: <your summary OR "No consensus">
"""
    return call_model(judge_prompt, judge_model)
|
| 19 |
+
|
def cross_model(prompt: str):
    """Query several models, then append a judge model's consensus verdict.

    A failure from any single model is reported inline next to its name
    (best-effort) rather than aborting the whole comparison.
    """
    candidate_models = ["openai/gpt-oss-20b", "llama-3.3-70b-versatile"]
    responses = {}
    for model_name in candidate_models:
        try:
            responses[model_name] = call_model(prompt, model_name)
        except Exception as err:
            # Surface the error in place of the answer so the judge/UI see it.
            responses[model_name] = f"⚠️ Error: {str(err)}"

    verdict = judge_consensus(prompt, responses)

    # Assemble the markdown report: each model's answer, then the judgment.
    parts = ["### Model Responses:\n"]
    parts.extend(f"- **{name}**: {answer}\n\n" for name, answer in responses.items())
    parts.append("\n### Judge Decision:\n" + verdict)
    return "".join(parts)
|
mechanisms/constraint.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pydantic import BaseModel, ValidationError
from utils.llm_utils import call_model
import json
import re

class AnswerSchema(BaseModel):
    # Single required field the model must return.
    answer: str

def _strip_code_fences(text: str) -> str:
    """Remove one surrounding markdown code fence (``` or ```json) if present.

    LLMs frequently wrap JSON in fences even when told not to; stripping them
    lets otherwise-valid JSON pass schema validation. Unfenced text is
    returned unchanged.
    """
    fenced = re.match(r"^\s*```(?:json)?\s*(.*?)\s*```\s*$", text, re.DOTALL)
    return fenced.group(1) if fenced else text

def constraint(prompt: str) -> str:
    """Force a JSON-only reply and validate it against ``AnswerSchema``.

    Returns:
        Pretty-printed validated JSON plus a success marker, or — when
        validation fails — the raw model output plus the validation error.
    """
    schema_instruction = f"""
Respond strictly in JSON format:
{{
"answer": "<your concise answer>"
}}
User query: {prompt}
"""
    raw = call_model(schema_instruction)
    try:
        # parse_raw (pydantic v1, matching the pinned requirement) wraps JSON
        # decode errors into ValidationError, so one except clause suffices.
        data = AnswerSchema.parse_raw(_strip_code_fences(raw))
        return json.dumps(data.dict(), indent=2) + "\n✅ Schema valid"
    except ValidationError as e:
        return raw + f"\n⚠️ Schema validation failed\n{e}"
|
mechanisms/historical.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import difflib
from utils.llm_utils import call_model
import streamlit as st

def historical(prompt: str) -> str:
    """Reuse the stored answer of the most similar past prompt, if close enough.

    Similarity is character-level (difflib ratio). A stored question scoring
    above 0.8 against the new prompt has its answer replayed; otherwise the
    model is called and the new Q/A pair is stored in
    ``st.session_state.history``.
    """
    def similarity(stored_question: str) -> float:
        return difflib.SequenceMatcher(None, prompt, stored_question).ratio()

    history = st.session_state.history
    best_match = max(history, key=similarity, default=None)
    if best_match and similarity(best_match) > 0.8:
        return history[best_match] + f"\n✅ Historical match (from: '{best_match}')"
    answer = call_model(prompt)
    history[prompt] = answer
    return answer + "\n💡 Stored in history"
|
mechanisms/predictability.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import difflib
from itertools import combinations
from utils.llm_utils import call_model

def predictability(prompt: str, runs: int = 3) -> str:
    """Run the same prompt several times and score pairwise output similarity.

    Args:
        prompt: Query sent identically on every run.
        runs: Number of repeated model calls; must be >= 2 so at least one
            pair exists to compare.

    Returns:
        All answers plus a predictability index: the mean pairwise difflib
        similarity across runs, expressed as a percentage.

    Raises:
        ValueError: If ``runs`` < 2 (the original code divided by zero here,
            since fewer than two runs produce no pairs).
    """
    if runs < 2:
        raise ValueError("runs must be >= 2 to measure predictability")
    answers = [call_model(prompt) for _ in range(runs)]
    ratios = [
        difflib.SequenceMatcher(None, a, b).ratio()
        for a, b in combinations(answers, 2)
    ]
    score = round(sum(ratios) / len(ratios) * 100, 2)
    return (
        f"Answers across {runs} runs:\n" +
        "\n".join([f"- Run {i+1}: {ans}" for i, ans in enumerate(answers)]) +
        f"\n\n🔢 Predictability Index: {score}%"
    )
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
python-dotenv
|
| 3 |
+
openai>=1.30.0
|
| 4 |
+
pydantic>=1.10,<2.0
|
| 5 |
+
|
utils/__pycache__/llm_utils.cpython-311.pyc
ADDED
|
Binary file (1.04 kB). View file
|
|
|
utils/llm_utils.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import hashlib
from config import client

def call_model(prompt: str, model="llama-3.1-8b-instant") -> str:
    """Send a single prompt through the shared Groq-backed client.

    Args:
        prompt: Text passed as the request input.
        model: Model identifier to query.

    Returns:
        The model's output text with surrounding whitespace stripped.
    """
    reply = client.responses.create(
        model=model,
        input=prompt,
    )
    return reply.output_text.strip()
|
| 10 |
+
|
def get_hash(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` (UTF-8), used as a cache key."""
    digest = hashlib.sha256(text.encode("utf-8"))
    return digest.hexdigest()
|