# VivekanandaAI / config.yaml
# Uploaded by jyotirmoy05 (commit 4889e2e, verified)
# ============================================================================
# VIVEKANANDA AI - CENTRAL CONFIGURATION
# NO HARDCODING - ALL PARAMETERS CONFIGURABLE
# ============================================================================
# Project Information
project:
  name: "Swami Vivekananda AI"
  version: "1.0.0"
  description: "AI embodying Swami Vivekananda's wisdom"
# Directory Structure (relative to project root)
paths:
  root: "."
  data:
    root: "data"
    raw: "data/raw"
    processed: "data/processed"
    extracted: "data/extracted_text"
    markdown: "data/markdown"
  vectorstore:
    root: "vectorstore"
    db_name: "vivekananda_db"
  models:
    root: "models"
    base: "models/base"
    fine_tuned: "models/fine_tuned"
  outputs:
    root: "outputs"
    logs: "outputs/logs"
    results: "outputs/results"
# Hardware Configuration
hardware:
  device: "cpu"  # Options: "mps", "cuda", "cpu"
  fallback_device: "cpu"
  torch_dtype: "float32"  # Options: "float32", "float16", "bfloat16"
# Model Configuration
model:
  # Base model settings
  base:
    name: "mistral-7b-instruct-v0.1"
    file: "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
    type: "gguf"
    model_type: "llama"  # For llama-cpp-python
  # Generation parameters
  generation:
    max_tokens: 600
    temperature: 0.4
    top_p: 0.85
    top_k: 30
    repeat_penalty: 1.2
    context_window: 4096
    n_batch: 512
    n_threads: 4  # Will auto-detect CPU cores
    n_gpu_layers: -1  # -1 = use all layers on GPU
  # Model weights (if modifying transformer architecture)
  weights:
    attention_dropout: 0.1
    hidden_dropout: 0.1
    layer_norm_eps: 1.0e-5
# Embedding Configuration
embeddings:
  model_name: "sentence-transformers/all-MiniLM-L6-v2"
  # Alternative options:
  # - "BAAI/bge-small-en-v1.5"
  # - "sentence-transformers/all-MiniLM-L12-v2"
  # all-MiniLM-L6-v2 produces 384-dim vectors (was 768, which would break a
  # FAISS index built for this model); set to null to auto-detect instead.
  dimension: 384
  normalize: true
  batch_size: 16
  show_progress: true
  use_hf: false
  # Chunking parameters
  chunk:
    size: 800
    overlap: 30
    separators:
      - "\n\n"
      - "\n"
      - ". "
      - "! "
      - "? "
      - "; "
      - " "
      - ""
# NLP Preprocessing Configuration
nlp:
  # spaCy settings
  spacy:
    model: "en_core_web_sm"
    max_length: 3000000
    max_lemmatize_chars: 400000
    # Download if not present: python -m spacy download en_core_web_sm
    pipeline:
      - "sentencizer"
      - "lemmatizer"
    disable:
      - "parser"
      - "ner"  # Disable if not needed for speed
  # NLTK settings
  nltk:
    tokenizer: "punkt"
    stopwords: "english"
    stemmer: "porter"  # Options: "porter", "snowball", "lancaster"
  # Text preprocessing
  preprocessing:
    lowercase: false  # Keep original case for proper nouns
    remove_stopwords: false  # Keep for context
    remove_punctuation: false
    lemmatize: true
    remove_numbers: false
    min_word_length: 2
    max_word_length: 50
# RAG (Retrieval-Augmented Generation) Configuration
rag:
  # Retrieval settings
  retrieval:
    top_k: 5
    similarity_threshold: 0.5  # Minimum similarity score
    search_type: "similarity"  # Options: "similarity", "mmr", "similarity_score_threshold"
    mmr_diversity_score: 0.3  # If using MMR
  # Context settings
  context:
    max_tokens: 2000
    max_chunks: 5
    include_metadata: true
    metadata_fields:
      - "source"
      - "work_type"
      - "topic"
      - "page"
  # Re-ranking (optional)
  rerank:
    enabled: false
    model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    top_k: 3
# Dataset Configuration
dataset:
  # JSON dataset
  json:
    file: "vivekananda_dataset_1.json"
    encoding: "utf-8"
    fields:
      instruction: "instruction"
      response: "response"
      source: "source"
      work_type: "work_type"
      topic: "topic"
  # PDF processing
  pdf:
    extraction_method: "auto"  # Options: "auto", "docling", "pypdf", "pdfplumber"
    ocr: false
    extract_images: false
    page_numbers: true
  # Text files
  text:
    encoding: "utf-8"
    file_extensions:
      - ".txt"
      - ".md"
  # Which source types to ingest
  load:
    pdf: false
    text: true
    markdown: true
    json: true
# Vector Store Configuration
vectorstore:
  type: "faiss"  # Options: "faiss", "chroma", "pinecone"
  faiss:
    index_type: "IndexFlatL2"  # Options: "IndexFlatL2", "IndexFlatIP", "IndexIVFFlat"
    metric: "l2"  # Options: "l2", "cosine"
    normalize_l2: true
  persistence:
    save_local: true
    allow_dangerous_deserialization: true  # Required for FAISS load
# Prompt Engineering
prompts:
  # System prompt
  system: |
    My young brothers and sisters—listen.
    Voice:
    - Bold, fiery, commanding. No therapy.
    - Short, powerful sentences. Upanishadic clarity.
    - First-person. Never as an outsider.
    Emphases:
    - Strength, fearlessness, purity, duty, service to the poor, nation-building.
    - Vedantic conviction; Advaita at the core; direct call to action.
    Prohibitions:
    - No modern clichés, decorative metaphors, or life-coach language.
    - No numbered steps or process language.
    - No bracketed citations. If quoting, at most one short line with a succinct source.
    Style:
    - Crisp, compact paragraphs; each a trumpet-blast.
    - Speak to India’s youth directly; scold weakness out of love.
  # RAG prompt template
  rag_template:
    header: |
      Context from Swami Vivekananda's works:
      {context}
      Question: {question}
    footer: |
      Answer strictly in Vivekananda’s voice with bold, commanding tone.
      - Speak directly in first-person; avoid third-person references.
      - Use short, powerful sentences; avoid numbered steps and self-help phrasing.
      - Paraphrase and synthesize; no bracketed numeric citations.
      - Include at most one short quote if essential; cite succinctly.
      - End with a call to action or benediction only when fitting.
  # Direct prompt template (no RAG)
  direct_template:
    template: |
      Question: {question}
      Answer in Vivekananda’s voice: bold, fiery, commanding.
      Use short, powerful sentences. No numbered steps. No life-coach tone.
      Avoid bracketed citations; if quoting, one short line with succinct source.
  # Optional centralized guardrails for style enforcement
  guardrails:
    direct_address: |
      Speak directly to the reader as Swami Vivekananda.
      Voice: fearless, compassionate, practical; encourage strength, service, and inner freedom.
      Structure: 1–2 line summary, then 3–5 actionable steps with synthesized insights.
      Constraints: avoid verbatim copying and bracketed numeric citations; paraphrase and blend ideas.
      Persona: use first-person (“I”) only—never write “Vivekananda said” or refer to yourself in third person.
      Quotes: at most one short quote if essential; cite succinctly. Close with an uplifting benediction.
    synthesis_hint: |
      Address me directly as Swami Vivekananda. Summarize, synthesize, and give practical steps.
      Avoid verbatim copying and numeric citations; one short quote only if essential.
# Logging Configuration
logging:
  level: "INFO"  # Options: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "outputs/logs/vivekananda_ai.log"
  console: true
  file_logging: true
# Fine-tuning Configuration (for future use)
fine_tuning:
  # LoRA/QLoRA parameters
  lora:
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    target_modules:
      - "q_proj"
      - "v_proj"
      - "k_proj"
      - "o_proj"
    bias: "none"
    task_type: "CAUSAL_LM"
  # Training parameters
  training:
    num_epochs: 3
    batch_size: 1
    gradient_accumulation_steps: 4
    learning_rate: 2.0e-4
    warmup_steps: 100
    max_grad_norm: 1.0
    weight_decay: 0.01
    lr_scheduler_type: "cosine"
  # Quantization
  quantization:
    load_in_4bit: true
    bnb_4bit_compute_dtype: "float16"
    bnb_4bit_quant_type: "nf4"
    bnb_4bit_use_double_quant: true
# Evaluation Metrics
evaluation:
  metrics:
    - "perplexity"
    - "bleu"
    - "rouge"
    - "semantic_similarity"
  test_queries:
    - "What is Karma Yoga?"
    - "How can I overcome fear?"
    - "What is the purpose of meditation?"
    - "What is true knowledge?"
# API Configuration (for future deployment)
api:
  host: "0.0.0.0"
  port: 8000
  reload: true
  workers: 1
  timeout: 120
# Streamlit Configuration
streamlit:
  title: "🕉️ Swami Vivekananda AI"
  page_icon: "🕉️"
  layout: "wide"
  initial_sidebar_state: "expanded"
  theme:
    primary_color: "#FF6B35"
    background_color: "#FFFFFF"
    secondary_background_color: "#F0F2F6"
    text_color: "#262730"
ocr:
  enabled: false
  lang: "eng"  # quoted to guarantee string type, matching the file's quoting style
  dpi: 300
  min_text_length: 50
  # Optional: set tesseract binary path if needed
  # tesseract_cmd: "/usr/local/bin/tesseract"