# VivekanandaAI / config.yaml
# Uploaded by jyotirmoy05 (commit 4889e2e, verified)
# ============================================================================
# VIVEKANANDA AI - CENTRAL CONFIGURATION
# NO HARDCODING - ALL PARAMETERS CONFIGURABLE
# ============================================================================
# Project Information
project:
  name: "Swami Vivekananda AI"
  version: "1.0.0"
  description: "AI embodying Swami Vivekananda's wisdom"
# Directory Structure (relative to project root)
paths:
  root: "."
  data:
    root: "data"
    raw: "data/raw"
    processed: "data/processed"
    extracted: "data/extracted_text"
    markdown: "data/markdown"
  vectorstore:
    root: "vectorstore"
    db_name: "vivekananda_db"
  models:
    root: "models"
    base: "models/base"
    fine_tuned: "models/fine_tuned"
  outputs:
    root: "outputs"
    logs: "outputs/logs"
    results: "outputs/results"
# Hardware Configuration
hardware:
  device: "cpu"  # Options: "mps", "cuda", "cpu"
  fallback_device: "cpu"
  torch_dtype: "float32"  # Options: "float32", "float16", "bfloat16"
# Model Configuration
model:
  # Base model settings
  base:
    name: "mistral-7b-instruct-v0.1"
    file: "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
    type: "gguf"
    model_type: "llama"  # For llama-cpp-python
  # Generation parameters
  generation:
    max_tokens: 600
    temperature: 0.4
    top_p: 0.85
    top_k: 30
    repeat_penalty: 1.2
    context_window: 4096
    n_batch: 512
    n_threads: 4  # Will auto-detect CPU cores
    n_gpu_layers: -1  # -1 = use all layers on GPU
  # Model weights (if modifying transformer architecture)
  weights:
    attention_dropout: 0.1
    hidden_dropout: 0.1
    layer_norm_eps: 1.0e-5
# Embedding Configuration
embeddings:
  model_name: "sentence-transformers/all-MiniLM-L6-v2"
  # Alternative options:
  # - "BAAI/bge-small-en-v1.5"
  # - "sentence-transformers/all-MiniLM-L12-v2"
  # all-MiniLM-L6-v2 produces 384-dim vectors (was 768, which would break a
  # FAISS index built for this model); set to null to auto-detect instead.
  dimension: 384
  normalize: true
  batch_size: 16
  show_progress: true
  use_hf: false
  # Chunking parameters
  chunk:
    size: 800
    overlap: 30
    separators:
      - "\n\n"
      - "\n"
      - ". "
      - "! "
      - "? "
      - "; "
      - " "
      - ""
# NLP Preprocessing Configuration
nlp:
  # spaCy settings
  spacy:
    model: "en_core_web_sm"
    max_length: 3000000
    max_lemmatize_chars: 400000
    # Download if not present: python -m spacy download en_core_web_sm
    pipeline:
      - "sentencizer"
      - "lemmatizer"
    disable:
      - "parser"
      - "ner"  # Disable if not needed for speed
  # NLTK settings
  nltk:
    tokenizer: "punkt"
    stopwords: "english"
    stemmer: "porter"  # Options: "porter", "snowball", "lancaster"
  # Text preprocessing
  preprocessing:
    lowercase: false  # Keep original case for proper nouns
    remove_stopwords: false  # Keep for context
    remove_punctuation: false
    lemmatize: true
    remove_numbers: false
    min_word_length: 2
    max_word_length: 50
# RAG (Retrieval-Augmented Generation) Configuration
rag:
  # Retrieval settings
  retrieval:
    top_k: 5
    similarity_threshold: 0.5  # Minimum similarity score
    search_type: "similarity"  # Options: "similarity", "mmr", "similarity_score_threshold"
    mmr_diversity_score: 0.3  # If using MMR
  # Context settings
  context:
    max_tokens: 2000
    max_chunks: 5
    include_metadata: true
    metadata_fields:
      - "source"
      - "work_type"
      - "topic"
      - "page"
  # Re-ranking (optional)
  rerank:
    enabled: false
    model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
    top_k: 3
# Dataset Configuration
dataset:
  # JSON dataset
  json:
    file: "vivekananda_dataset_1.json"
    encoding: "utf-8"
    fields:
      instruction: "instruction"
      response: "response"
      source: "source"
      work_type: "work_type"
      topic: "topic"
  # PDF processing
  pdf:
    extraction_method: "auto"  # Options: "auto", "docling", "pypdf", "pdfplumber"
    ocr: false
    extract_images: false
    page_numbers: true
  # Text files
  text:
    encoding: "utf-8"
    file_extensions:
      - ".txt"
      - ".md"
  # Which source types to ingest
  load:
    pdf: false
    text: true
    markdown: true
    json: true
# Vector Store Configuration
vectorstore:
  type: "faiss"  # Options: "faiss", "chroma", "pinecone"
  faiss:
    index_type: "IndexFlatL2"  # Options: "IndexFlatL2", "IndexFlatIP", "IndexIVFFlat"
    metric: "l2"  # Options: "l2", "cosine"
    normalize_l2: true
  persistence:
    save_local: true
    allow_dangerous_deserialization: true  # Required for FAISS load
# Prompt Engineering
prompts:
  # System prompt
  system: |
    My young brothers and sisters—listen.
    Voice:
    - Bold, fiery, commanding. No therapy.
    - Short, powerful sentences. Upanishadic clarity.
    - First-person. Never as an outsider.
    Emphases:
    - Strength, fearlessness, purity, duty, service to the poor, nation-building.
    - Vedantic conviction; Advaita at the core; direct call to action.
    Prohibitions:
    - No modern clichés, decorative metaphors, or life-coach language.
    - No numbered steps or process language.
    - No bracketed citations. If quoting, at most one short line with a succinct source.
    Style:
    - Crisp, compact paragraphs; each a trumpet-blast.
    - Speak to India’s youth directly; scold weakness out of love.
  # RAG prompt template
  rag_template:
    header: |
      Context from Swami Vivekananda's works:
      {context}
      Question: {question}
    footer: |
      Answer strictly in Vivekananda’s voice with bold, commanding tone.
      - Speak directly in first-person; avoid third-person references.
      - Use short, powerful sentences; avoid numbered steps and self-help phrasing.
      - Paraphrase and synthesize; no bracketed numeric citations.
      - Include at most one short quote if essential; cite succinctly.
      - End with a call to action or benediction only when fitting.
  # Direct prompt template (no RAG)
  direct_template:
    template: |
      Question: {question}
      Answer in Vivekananda’s voice: bold, fiery, commanding.
      Use short, powerful sentences. No numbered steps. No life-coach tone.
      Avoid bracketed citations; if quoting, one short line with succinct source.
  # Optional centralized guardrails for style enforcement
  guardrails:
    direct_address: |
      Speak directly to the reader as Swami Vivekananda.
      Voice: fearless, compassionate, practical; encourage strength, service, and inner freedom.
      Structure: 1–2 line summary, then 3–5 actionable steps with synthesized insights.
      Constraints: avoid verbatim copying and bracketed numeric citations; paraphrase and blend ideas.
      Persona: use first-person (“I”) only—never write “Vivekananda said” or refer to yourself in third person.
      Quotes: at most one short quote if essential; cite succinctly. Close with an uplifting benediction.
    synthesis_hint: |
      Address me directly as Swami Vivekananda. Summarize, synthesize, and give practical steps.
      Avoid verbatim copying and numeric citations; one short quote only if essential.
# Logging Configuration
logging:
  level: "INFO"  # Options: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "outputs/logs/vivekananda_ai.log"
  console: true
  file_logging: true
# Fine-tuning Configuration (for future use)
fine_tuning:
  # LoRA/QLoRA parameters
  lora:
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    target_modules:
      - "q_proj"
      - "v_proj"
      - "k_proj"
      - "o_proj"
    bias: "none"
    task_type: "CAUSAL_LM"
  # Training parameters
  training:
    num_epochs: 3
    batch_size: 1
    gradient_accumulation_steps: 4
    learning_rate: 2.0e-4
    warmup_steps: 100
    max_grad_norm: 1.0
    weight_decay: 0.01
    lr_scheduler_type: "cosine"
  # Quantization
  quantization:
    load_in_4bit: true
    bnb_4bit_compute_dtype: "float16"
    bnb_4bit_quant_type: "nf4"
    bnb_4bit_use_double_quant: true
# Evaluation Metrics
evaluation:
  metrics:
    - "perplexity"
    - "bleu"
    - "rouge"
    - "semantic_similarity"
  test_queries:
    - "What is Karma Yoga?"
    - "How can I overcome fear?"
    - "What is the purpose of meditation?"
    - "What is true knowledge?"
# API Configuration (for future deployment)
api:
  host: "0.0.0.0"
  port: 8000
  reload: true
  workers: 1
  timeout: 120
# Streamlit Configuration
streamlit:
  title: "🕉️ Swami Vivekananda AI"
  page_icon: "🕉️"
  layout: "wide"
  initial_sidebar_state: "expanded"
  theme:
    primary_color: "#FF6B35"
    background_color: "#FFFFFF"
    secondary_background_color: "#F0F2F6"
    text_color: "#262730"
ocr:
  enabled: false
  lang: "eng"  # quoted to guarantee string type, matching the file's quoting style
  dpi: 300
  min_text_length: 50
  # Optional: set tesseract binary path if needed
  # tesseract_cmd: "/usr/local/bin/tesseract"