---
# (Hosting-page residue "Spaces: Sleeping" kept only as a comment; it is not configuration.)
# ============================================================================
# VIVEKANANDA AI - CENTRAL CONFIGURATION
# NO HARDCODING - ALL PARAMETERS CONFIGURABLE
# ============================================================================
# Project metadata: display name, release version, and a one-line summary.
project:
  name: 'Swami Vivekananda AI'
  version: '1.0.0'  # quoted so it stays a string
  description: "AI embodying Swami Vivekananda's wisdom"
# Filesystem layout. Every path below is relative to the project root.
paths:
  root: '.'
  # Corpus at its various processing stages
  data:
    root: 'data'
    raw: 'data/raw'
    processed: 'data/processed'
    extracted: 'data/extracted_text'
    markdown: 'data/markdown'
  # On-disk vector index location and database name
  vectorstore:
    root: 'vectorstore'
    db_name: 'vivekananda_db'
  # Model files (base and fine-tuned)
  models:
    root: 'models'
    base: 'models/base'
    fine_tuned: 'models/fine_tuned'
  # Generated artefacts
  outputs:
    root: 'outputs'
    logs: 'outputs/logs'
    results: 'outputs/results'
# Compute target used when running the model.
hardware:
  device: 'cpu'  # Options: "mps", "cuda", "cpu"
  fallback_device: 'cpu'
  torch_dtype: 'float32'  # Options: "float32", "float16", "bfloat16"
# Language-model selection and runtime parameters.
model:
  # Base checkpoint to load
  base:
    name: 'mistral-7b-instruct-v0.1'
    file: 'mistral-7b-instruct-v0.1.Q4_K_M.gguf'
    type: 'gguf'
    model_type: 'llama'  # For llama-cpp-python
  # Sampling and runtime settings.
  # NOTE(review): indentation was lost in this copy; context_window through
  # n_gpu_layers are assumed to belong to `generation` - confirm against the loader.
  generation:
    max_tokens: 600
    temperature: 0.4
    top_p: 0.85
    top_k: 30
    repeat_penalty: 1.2
    context_window: 4096
    n_batch: 512
    # Fixed at 4 here. The original note says "Will auto-detect CPU cores";
    # presumably the loader overrides this value - TODO confirm.
    n_threads: 4
    n_gpu_layers: -1  # -1 = use all layers on GPU
  # Regularisation/normalisation values (only relevant if modifying the
  # transformer architecture)
  weights:
    attention_dropout: 0.1
    hidden_dropout: 0.1
    layer_norm_eps: 1.0e-5
# Embedding Configuration: sentence-embedding model and text chunking.
embeddings:
  model_name: "sentence-transformers/all-MiniLM-L6-v2"
  # Alternative options:
  #   - "BAAI/bge-small-en-v1.5"
  #   - "sentence-transformers/all-MiniLM-L12-v2"
  # all-MiniLM-L6-v2 produces 384-dimensional vectors; the earlier value of 768
  # did not match the configured model. Keep this in sync with model_name
  # (per the original note, null requests auto-detection).
  dimension: 384
  normalize: true
  batch_size: 16
  show_progress: true
  use_hf: false
  # Chunking parameters.
  # NOTE(review): indentation was lost in this copy; `chunk` is assumed to be
  # nested under `embeddings` - confirm against the code that reads it.
  chunk:
    size: 800
    overlap: 30
    # Split points, tried from coarsest (paragraph) to finest (character).
    separators:
      - "\n\n"
      - "\n"
      - ". "
      - "! "
      - "? "
      - "; "
      - " "
      - ""
# NLP preprocessing: spaCy, NLTK, and generic text-cleaning switches.
nlp:
  # spaCy settings
  spacy:
    model: 'en_core_web_sm'
    max_length: 3000000
    max_lemmatize_chars: 400000
    # Download if not present: python -m spacy download en_core_web_sm
    pipeline:
      - 'sentencizer'
      - 'lemmatizer'
    disable:
      - 'parser'
      - 'ner'  # Disable if not needed for speed
  # NLTK settings
  nltk:
    tokenizer: 'punkt'
    stopwords: 'english'
    stemmer: 'porter'  # Options: "porter", "snowball", "lancaster"
  # Text preprocessing switches
  preprocessing:
    lowercase: false  # Keep original case for proper nouns
    remove_stopwords: false  # Keep for context
    remove_punctuation: false
    lemmatize: true
    remove_numbers: false
    min_word_length: 2
    max_word_length: 50
# RAG (Retrieval-Augmented Generation): retrieval, context assembly, re-ranking.
rag:
  # Retrieval settings
  retrieval:
    top_k: 5
    similarity_threshold: 0.5  # Minimum similarity score
    search_type: 'similarity'  # Options: "similarity", "mmr", "similarity_score_threshold"
    mmr_diversity_score: 0.3  # If using MMR
  # Context settings
  context:
    max_tokens: 2000
    max_chunks: 5
    include_metadata: true
    metadata_fields:
      - 'source'
      - 'work_type'
      - 'topic'
      - 'page'
  # Re-ranking (optional; off by default)
  rerank:
    enabled: false
    model: 'cross-encoder/ms-marco-MiniLM-L-6-v2'
    top_k: 3
# Dataset sources and per-format handling.
dataset:
  # JSON dataset: file name, encoding, and the record-field names to read
  json:
    file: 'vivekananda_dataset_1.json'
    encoding: 'utf-8'
    fields:
      instruction: 'instruction'
      response: 'response'
      source: 'source'
      work_type: 'work_type'
      topic: 'topic'
  # PDF processing
  pdf:
    extraction_method: 'auto'  # Options: "auto", "docling", "pypdf", "pdfplumber"
    ocr: false
    extract_images: false
    page_numbers: true
  # Text files
  text:
    encoding: 'utf-8'
    file_extensions:
      - '.txt'
      - '.md'
  # Per-format load switches.
  # NOTE(review): indentation was lost in this copy; `load` is assumed to be
  # a child of `dataset` - confirm against the code that reads it.
  load:
    pdf: false
    text: true
    markdown: true
    json: true
# Vector store backend and persistence.
vectorstore:
  type: 'faiss'  # Options: "faiss", "chroma", "pinecone"
  faiss:
    index_type: 'IndexFlatL2'  # Options: "IndexFlatL2", "IndexFlatIP", "IndexIVFFlat"
    metric: 'l2'  # Options: "l2", "cosine"
    normalize_l2: true
  # NOTE(review): indentation was lost in this copy; `persistence` is assumed
  # to be a sibling of `faiss` - confirm against the code that reads it.
  persistence:
    save_local: true
    # Required for FAISS load. NOTE(review): the flag name implies unsafe
    # (pickle-style) deserialization - only load index files you created.
    allow_dangerous_deserialization: true
# Prompt engineering: persona prompt, templates, and style guardrails.
# NOTE(review): this copy lost indentation, block-scalar indicators, and any
# blank lines inside the prompts; literal blocks (`|`) are assumed throughout.
prompts:
  # System prompt (persona and voice rules)
  system: |
    My young brothers and sisters—listen.
    Voice:
    - Bold, fiery, commanding. No therapy.
    - Short, powerful sentences. Upanishadic clarity.
    - First-person. Never as an outsider.
    Emphases:
    - Strength, fearlessness, purity, duty, service to the poor, nation-building.
    - Vedantic conviction; Advaita at the core; direct call to action.
    Prohibitions:
    - No modern clichés, decorative metaphors, or life-coach language.
    - No numbered steps or process language.
    - No bracketed citations. If quoting, at most one short line with a succinct source.
    Style:
    - Crisp, compact paragraphs; each a trumpet-blast.
    - Speak to India’s youth directly; scold weakness out of love.
  # RAG prompt template; {context} and {question} are placeholders
  rag_template:
    header: |
      Context from Swami Vivekananda's works:
      {context}
      Question: {question}
    footer: |
      Answer strictly in Vivekananda’s voice with bold, commanding tone.
      - Speak directly in first-person; avoid third-person references.
      - Use short, powerful sentences; avoid numbered steps and self-help phrasing.
      - Paraphrase and synthesize; no bracketed numeric citations.
      - Include at most one short quote if essential; cite succinctly.
      - End with a call to action or benediction only when fitting.
  # Direct prompt template (no RAG); {question} is a placeholder
  direct_template:
    template: |
      Question: {question}
      Answer in Vivekananda’s voice: bold, fiery, commanding.
      Use short, powerful sentences. No numbered steps. No life-coach tone.
      Avoid bracketed citations; if quoting, one short line with succinct source.
  # Optional centralized guardrails for style enforcement.
  # NOTE(review): direct_address asks for "3–5 actionable steps" while the
  # system prompt forbids numbered steps and process language - reconcile.
  guardrails:
    direct_address: |
      Speak directly to the reader as Swami Vivekananda.
      Voice: fearless, compassionate, practical; encourage strength, service, and inner freedom.
      Structure: 1–2 line summary, then 3–5 actionable steps with synthesized insights.
      Constraints: avoid verbatim copying and bracketed numeric citations; paraphrase and blend ideas.
      Persona: use first-person (“I”) only—never write “Vivekananda said” or refer to yourself in third person.
      Quotes: at most one short quote if essential; cite succinctly. Close with an uplifting benediction.
    synthesis_hint: |
      Address me directly as Swami Vivekananda. Summarize, synthesize, and give practical steps.
      Avoid verbatim copying and numeric citations; one short quote only if essential.
# Logging: verbosity, record format, and output targets.
logging:
  level: 'INFO'  # Options: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  file: 'outputs/logs/vivekananda_ai.log'
  console: true
  file_logging: true
# Fine-tuning settings (reserved for future use).
fine_tuning:
  # LoRA/QLoRA adapter parameters
  lora:
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    target_modules:
      - 'q_proj'
      - 'v_proj'
      - 'k_proj'
      - 'o_proj'
    bias: 'none'
    task_type: 'CAUSAL_LM'
  # Training hyper-parameters
  training:
    num_epochs: 3
    batch_size: 1
    gradient_accumulation_steps: 4
    learning_rate: 2.0e-4  # written with a decimal point and signed exponent
    warmup_steps: 100
    max_grad_norm: 1.0
    weight_decay: 0.01
    lr_scheduler_type: 'cosine'
  # 4-bit quantization settings
  quantization:
    load_in_4bit: true
    bnb_4bit_compute_dtype: 'float16'
    bnb_4bit_quant_type: 'nf4'
    bnb_4bit_use_double_quant: true
# Evaluation: metric names and the fixed set of probe questions.
evaluation:
  metrics:
    - 'perplexity'
    - 'bleu'
    - 'rouge'
    - 'semantic_similarity'
  test_queries:
    - 'What is Karma Yoga?'
    - 'How can I overcome fear?'
    - 'What is the purpose of meditation?'
    - 'What is true knowledge?'
# API server settings (for future deployment).
api:
  host: '0.0.0.0'  # binds on all network interfaces
  port: 8000
  # NOTE(review): auto-reload looks like a development setting - confirm
  # before using this file for a real deployment.
  reload: true
  workers: 1
  timeout: 120
# Streamlit UI: page chrome and colour theme.
streamlit:
  title: '🕉️ Swami Vivekananda AI'
  page_icon: '🕉️'
  layout: 'wide'
  initial_sidebar_state: 'expanded'
  # Hex colours must stay quoted; a bare leading "#" would start a comment.
  theme:
    primary_color: '#FF6B35'
    background_color: '#FFFFFF'
    secondary_background_color: '#F0F2F6'
    text_color: '#262730'
# OCR Configuration (disabled by default).
# NOTE(review): indentation was lost in this copy; `ocr` is assumed to be a
# top-level section - confirm against the code that reads it.
ocr:
  enabled: false
  lang: 'eng'
  dpi: 300
  min_text_length: 50
  # Optional: set tesseract binary path if needed
  # tesseract_cmd: /usr/local/bin/tesseract