"""
Configuration module for Module A.

Contains all settings, paths, and parameters.
"""

import os
import re
from pathlib import Path

# Optionally load environment variables from a .env file.
# python-dotenv is treated as an optional dependency: when it is not
# installed the module falls back to the plain process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    # Deliberate best-effort: no dotenv support, nothing to load.
    pass
else:
    # Look for a .env file one level above the folder containing this file.
    _BASE_DIR = Path(__file__).parent.parent
    env_file = _BASE_DIR / ".env"
    if env_file.exists():
        load_dotenv(env_file)
    else:
        # No .env at that location: call load_dotenv() without a path so
        # python-dotenv applies its own default lookup.
        load_dotenv()

# --- Directory layout ---------------------------------------------------
# BASE_DIR is the directory one level above the folder containing this file;
# every other path is derived from it.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR / "data" / "module-A"
LAW_DIR = DATA_DIR / "law"
CHUNKS_DIR = DATA_DIR / "chunks"
LOG_DIR = DATA_DIR / "logs"

# Import-time side effect: make sure the chunk and log directories exist.
# LAW_DIR is intentionally left untouched here (it is not created).
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)

# --- Output files -------------------------------------------------------
CHUNKS_OUTPUT_FILE = CHUNKS_DIR / "processed_chunks.json"

# --- Chunking parameters ------------------------------------------------
# Chunk size bounds and overlap, expressed in words.
CHUNK_SIZE_MIN_WORDS = 300
CHUNK_SIZE_MAX_WORDS = 600
CHUNK_SIZE_TARGET_WORDS = 450
CHUNK_OVERLAP_WORDS = 50

# Conversion factor used to derive the token bounds from the word bounds.
TOKENS_PER_WORD = 1.3

# Token bounds: the word bounds scaled by TOKENS_PER_WORD and truncated
# to an integer (390 and 780 with the values above).
CHUNK_SIZE_MIN_TOKENS = int(CHUNK_SIZE_MIN_WORDS * TOKENS_PER_WORD)
CHUNK_SIZE_MAX_TOKENS = int(CHUNK_SIZE_MAX_WORDS * TOKENS_PER_WORD)

# --- Text-cleaning patterns ---------------------------------------------
# Regular-expression source strings grouped by the kind of noise they
# describe. The strings are stored uncompiled; any flags (for example
# re.MULTILINE for the '^...$' page-number pattern) are chosen by whoever
# compiles them.
CLEANING_PATTERNS = {
    # Page-number lines and "Page N" markers (English and Nepali).
    'page_numbers': [
        r'^\s*\d+\s*$',
        r'Page\s+\d+',
        r'पृष्ठ\s+\d+',
    ],
    # Recurring header / footer text.
    'headers_footers': [
        r'www\..*?\.gov\.np',
        r'Constitution of Nepal.*?\d{4}',
        r'Nepal Gazette.*?Part.*?\d+',
        r'©.*?Government of Nepal',
    ],
    # Table-of-contents headings (English and Nepali).
    'toc_patterns': [
        r'Table of Contents',
        r'CONTENTS',
        r'विषयसूची',
    ],
    # Whitespace runs: three or more line breaks separated only by
    # whitespace, and runs of spaces / tabs.
    'whitespace': [
        r'\n\s*\n\s*\n+',
        r'[ \t]+',
    ],
}

# --- Section-heading patterns -------------------------------------------
# Regex source strings for recognising structural headings. Each pattern
# anchors at the start of the string and captures the heading number
# (digits with an optional single letter suffix) as group 1.
SECTION_PATTERNS = [
    # Numbered heading with a title, e.g. "12A. Short Title:" -
    # group 1 is the number, group 2 the title text before the colon.
    r'^\s*(\d+[A-Za-z]?)\.\s+([A-Z][^:]+):',
    # "Article N"
    r'^\s*(?:Article|ARTICLE)\s+(\d+[A-Za-z]?)',
    # "Section N"
    r'^\s*(?:Section|SECTION)\s+(\d+[A-Za-z]?)',
    # "Part N"
    r'^\s*(?:Part|PART)\s+(\d+[A-Za-z]?)',
    # "Chapter N"
    r'^\s*(?:Chapter|CHAPTER)\s+(\d+[A-Za-z]?)',
    # Nepali headings.
    r'^\s*धारा\s+(\d+[A-Za-z]?)',
    r'^\s*अनुच्छेद\s+(\d+[A-Za-z]?)',
]

# Pre-compiled versions of SECTION_PATTERNS, in the same order.
# NOTE(review): re.IGNORECASE also applies to the '[A-Z]' class in the first
# pattern, so a title starting with a lowercase letter matches as well, and
# the explicit 'Article|ARTICLE' style alternatives are redundant. Left
# unchanged because callers may rely on the current matching - confirm.
# NOTE(review): re.MULTILINE is not set, so '^' only matches at the start of
# the searched string; presumably these are applied line by line - confirm.
COMPILED_SECTION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE) for pattern in SECTION_PATTERNS
]

# --- Logging ------------------------------------------------------------
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_FILE = LOG_DIR / "pinecone.log"
LOG_FILE_MAX_BYTES = 10 * 1024 * 1024  # 10 MiB
LOG_FILE_BACKUP_COUNT = 5

# --- PDF extraction -----------------------------------------------------
# Identifiers for the primary and fallback extraction methods; how they are
# interpreted is up to the code that reads them.
PDF_EXTRACTION_METHOD = "pdfplumber"
PDF_FALLBACK_METHOD = "pypdf2"

# --- Vector store and embeddings ----------------------------------------
VECTOR_DB_DIR = DATA_DIR / "vector_db"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Presumably the vector size produced by EMBEDDING_MODEL - keep the two in
# sync if the model is ever changed (TODO confirm).
EMBEDDING_DIMENSION = 384
EMBEDDING_BATCH_SIZE = 32

# --- Pinecone -----------------------------------------------------------
# Both values are read from the environment when this module is imported.
# PINECONE_API_KEY falls back to an empty string when the variable is unset,
# so consumers must treat "" as "no key configured".
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "nepal-legal-docs")
PINECONE_TEXT_STORAGE_FILE = DATA_DIR / "pinecone_text_storage.json"

# --- Retrieval ----------------------------------------------------------
DEFAULT_RETRIEVAL_K = 5

# --- Mistral ------------------------------------------------------------
MISTRAL_MODEL = "mistral-tiny"
# Name of the environment variable that holds the API key; the key itself
# is not read in this module.
MISTRAL_API_KEY_ENV_VAR = "MISTRAL_API_KEY"