Spaces:
Sleeping
Sleeping
| # Import required libraries | |
| import os | |
| from dotenv import load_dotenv | |
| # Load environment variables from .env file | |
| load_dotenv() | |
# ---------------------------------------------------------------------------
# Fine-tuned reranker artefact
# ---------------------------------------------------------------------------
# Directory where the fine-tuned reranker model is saved after fine-tuning,
# so it can be loaded later for inference.
FINE_TUNED_RERANKER_SAVE_PATH = "models/reranker_fine_tuned"

# ---------------------------------------------------------------------------
# MongoDB configuration
# ---------------------------------------------------------------------------
# Connection string for the MongoDB Atlas database.
#
# SECURITY: the connection string carries credentials, so it is read only
# from the environment (which includes values loaded from the .env file by
# load_dotenv()). No fallback is embedded in source: a connection string
# committed to the repository exposes the database to anyone who can read it.
# Any credential that was previously committed here should be rotated.
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    # Warn instead of raising so that code paths which never touch MongoDB
    # can still import this module; anything that connects needs a real URI.
    print(
        "WARNING: MONGO_URI is not set. Define it in the environment or in the "
        ".env file; no default connection string is provided."
    )

# Name of the MongoDB database.
MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "NewsDataSet")

# Collection storing news articles (parsed XML articles).
MONGO_NEWS_COLLECTION_NAME = os.getenv("MONGO_NEWS_COLLECTION_NAME", "parsedXmlArticles")

# Collection storing user session data.
MONGO_SESSIONS_COLLECTION_NAME = os.getenv("MONGO_SESSIONS_COLLECTION_NAME", "user_sessions")

# Collection storing FAISS index metadata (such as the indexed IDs).
MONGO_FAISS_META_COLLECTION_NAME = os.getenv("MONGO_FAISS_META_COLLECTION_NAME", "faiss_index_meta")

# Collection storing user feedback / tracking data.
MONGO_TRACKING_COLLECTION_NAME = os.getenv("MONGO_TRACKING_COLLECTION_NAME", "user_feedback_tracking")
# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
# Embedding model identifier, used to turn text into vectors.
EMBED_MODEL_NAME: str = "paraphrase-multilingual-MiniLM-L12-v2"

# Text-generation model identifier for Indic languages.
GENERATOR_MODEL_NAME: str = "ai4bharat/IndicBART"

# Reranking model identifier, used to improve the ordering of search results.
RERANKER_MODEL_NAME: str = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"
# ---------------------------------------------------------------------------
# File paths
# ---------------------------------------------------------------------------
# FAISS index file used for fast similarity search.
INDEX_PATH: str = "DataEmbeddings.bin"

# JSON file listing the IDs that correspond to the FAISS index vectors.
# Legacy: this information is now primarily kept in MongoDB.
INDEX_IDS_PATH: str = "DataEmbeddings_ids.json"

# JSON file where user interaction logs are stored.
INTERACTION_LOG_PATH: str = "logs/Hindi_User_Interactions.json"
# ---------------------------------------------------------------------------
# Indic NLP resources (used for text processing)
# ---------------------------------------------------------------------------
# Default location: an "indic_nlp_resources" directory in the grandparent of
# the directory that contains this file.
_DEFAULT_INDIC_NLP_RESOURCES_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    "indic_nlp_resources",
)

# Optional override supplied through the environment.
_ENV_SUPPLIED_INDIC_NLP_PATH = os.getenv("INDIC_NLP_RESOURCES_PATH")

# Resolution order:
#   1. no override set                      -> default path
#   2. override set and it exists           -> override
#   3. override missing, default exists     -> default path, with a warning
#   4. override missing and default missing -> override, so the later failure
#      names the path the user actually configured
INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
if _ENV_SUPPLIED_INDIC_NLP_PATH:
    if os.path.exists(_ENV_SUPPLIED_INDIC_NLP_PATH) or not os.path.exists(
        _DEFAULT_INDIC_NLP_RESOURCES_PATH
    ):
        INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
    else:
        print(f"WARNING: INDIC_NLP_RESOURCES_PATH from environment ('{_ENV_SUPPLIED_INDIC_NLP_PATH}') not found or invalid.")
        print(f"Falling back to default path: '{_DEFAULT_INDIC_NLP_RESOURCES_PATH}'")
# ---------------------------------------------------------------------------
# Column names
# ---------------------------------------------------------------------------
# Article headline.
HEADLINE_COL: str = "hl"

# SEO location / URL.
SEOLOCATION_COL: str = "seolocation"

# Deeplink.
DEEPLINK_COL: str = "dl"

# Last-updated value.
LAST_UPDATED_COL: str = "lu"

# Image ID, image ratio and image size.
IMAGE_ID_COL: str = "imageid"
IMAGE_RATIO_COL: str = "imgratio"
IMAGE_SIZE_COL: str = "imgsize"

# NOTE(review): undocumented in the original; presumably the article
# synopsis and keywords respectively - confirm against the article schema.
SYN_COL: str = "syn"
KEY_COL: str = "key"

# Article ID.
ID_COL: str = "id"

# Article topic.
TOPIC_COL: str = "tn"

# Taxonomy: a list of objects with 'name' and 'code'.
TAXONOMY_COL: str = "tx"

# Article source / property.
PROPERTY_COL: str = "host"
# ---------------------------------------------------------------------------
# API configuration
# ---------------------------------------------------------------------------
# Title of the API service.
API_TITLE: str = "RAG Recommendation API"

# Description of the API service.
API_DESCRIPTION: str = (
    "API providing RAG-based recommendations for multi content, using MongoDB Atlas"
)

# Version of the API.
API_VERSION: str = "1.0.0"
# ---------------------------------------------------------------------------
# Model parameters
# ---------------------------------------------------------------------------
# Number of recommendations returned by default.
DEFAULT_K: int = 5

# Threshold for similarity matching.
# NOTE(review): the value is negative, so it is presumably compared against
# raw reranker scores rather than a 0..1 similarity - confirm at the call site.
SIMILARITY_THRESHOLD: float = -8.0

# Multiplier for the number of candidates to consider before reranking.
CANDIDATE_MULTIPLIER: int = 3