File size: 4,042 Bytes
3998131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
Configuration module for Module A
Contains all settings, paths, and parameters
"""

import os
from pathlib import Path
import re

# Optionally hydrate os.environ from a .env file (needs python-dotenv).
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv not installed — fall back to the real environment only.
    pass
else:
    # Prefer the project-root .env (parent of module_a); otherwise let
    # load_dotenv() search from the current working directory.
    _BASE_DIR = Path(__file__).parent.parent
    env_file = _BASE_DIR / ".env"
    load_dotenv(env_file) if env_file.exists() else load_dotenv()

# Base paths — everything lives under <repo root>/data/module-A.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR / "data" / "module-A"
LAW_DIR = DATA_DIR / "law"       # input: raw legal documents
CHUNKS_DIR = DATA_DIR / "chunks" # output: processed chunk JSON
LOG_DIR = DATA_DIR / "logs"      # output: rotating log files

# Create the output directories up front so writers never have to.
for _out_dir in (CHUNKS_DIR, LOG_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Final artifact of the chunking pipeline.
CHUNKS_OUTPUT_FILE = CHUNKS_DIR / "processed_chunks.json"

# Chunking parameters — word-count bounds for a single text chunk.
CHUNK_SIZE_MIN_WORDS = 300     # chunks shorter than this are presumably merged — TODO confirm at call site
CHUNK_SIZE_MAX_WORDS = 600     # hard upper bound before a forced split
CHUNK_SIZE_TARGET_WORDS = 450  # preferred size the splitter aims for
CHUNK_OVERLAP_WORDS = 50  # Overlap for context preservation

# Token estimation (rough: 1 word ≈ 1.3 tokens)
CHUNK_SIZE_MIN_TOKENS = int(CHUNK_SIZE_MIN_WORDS * 1.3)  # 390
CHUNK_SIZE_MAX_TOKENS = int(CHUNK_SIZE_MAX_WORDS * 1.3)  # 780

# Text cleaning patterns, grouped by the kind of noise they remove.
# NOTE(review): these are raw (uncompiled) pattern strings; the consumer
# presumably applies them with re.sub per category — confirm at call site.
CLEANING_PATTERNS = {
    # Page numbers (various formats)
    'page_numbers': [
        r'^\s*\d+\s*$',  # Standalone numbers (anchored — assumes per-line matching or re.MULTILINE)
        r'Page\s+\d+',
        r'पृष्ठ\s+\d+',  # Nepali "Page N"
    ],
    
    # Headers and footers
    'headers_footers': [
        r'www\..*?\.gov\.np',                # government site URLs
        r'Constitution of Nepal.*?\d{4}',    # running title with year
        r'Nepal Gazette.*?Part.*?\d+',       # gazette part headers
        r'©.*?Government of Nepal',          # copyright footer
    ],
    
    # Table of contents patterns
    'toc_patterns': [
        r'Table of Contents',
        r'CONTENTS',
        r'विषयसूची',  # Nepali "table of contents"
    ],
    
    # Excessive whitespace
    'whitespace': [
        r'\n\s*\n\s*\n+',  # Multiple blank lines
        r'[ \t]+',  # Multiple spaces/tabs (collapses ALL runs, including single spaces followed by more — replacement string matters)
    ],
}

# Section/Article detection patterns. Group 1 of every pattern captures the
# section identifier (e.g. "11", "11A"). All patterns are compiled below with
# re.IGNORECASE, so the explicit upper/lower alternations are redundant but
# harmless. The ^ anchors assume patterns are matched line-by-line (or with
# re.MULTILINE added at the call site) — TODO confirm.
SECTION_PATTERNS = [
    # Numbered sections at start of line: "11. Right to citizenship:"
    # Group 2 captures the title text before the colon. NOTE(review): with
    # IGNORECASE the leading [A-Z] also matches lowercase — confirm intended.
    r'^\s*(\d+[A-Za-z]?)\.\s+([A-Z][^:]+):',
    
    # Article patterns: "Article 11", "ARTICLE 11"
    r'^\s*(?:Article|ARTICLE)\s+(\d+[A-Za-z]?)',
    
    # Section patterns: "Section 8", "SECTION 8"  
    r'^\s*(?:Section|SECTION)\s+(\d+[A-Za-z]?)',
    
    # Part patterns: "Part 4", "PART 4"
    r'^\s*(?:Part|PART)\s+(\d+[A-Za-z]?)',
    
    # Chapter patterns: "Chapter 3", "CHAPTER 3"
    r'^\s*(?:Chapter|CHAPTER)\s+(\d+[A-Za-z]?)',
    
    # Nepali patterns (if needed later)
    r'^\s*धारा\s+(\d+[A-Za-z]?)',      # "dhara" = section
    r'^\s*अनुच्छेद\s+(\d+[A-Za-z]?)',  # "anuchchhed" = article
]

# Compile regex patterns for efficiency
COMPILED_SECTION_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in SECTION_PATTERNS]

# Logging configuration — consumed by a RotatingFileHandler setup elsewhere
# (presumably; the max-bytes/backup-count pair matches that API — confirm).
LOG_LEVEL = "INFO"
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_FILE = LOG_DIR / "pinecone.log"
LOG_FILE_MAX_BYTES = 10 * 1024 * 1024  # 10MB
LOG_FILE_BACKUP_COUNT = 5  # Keep 5 backup files

# PDF extraction settings
PDF_EXTRACTION_METHOD = "pdfplumber"  # Options: "pdfplumber", "pypdf2"
PDF_FALLBACK_METHOD = "pypdf2"  # Fallback if primary fails

# Vector database settings (Step 3)
VECTOR_DB_DIR = DATA_DIR / "vector_db"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIMENSION = 384  # For all-MiniLM-L6-v2
EMBEDDING_BATCH_SIZE = 32  # texts embedded per forward pass

# Pinecone settings - Read from environment or set here
# Get your API key from: https://app.pinecone.io/
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")  # empty string when unset — callers must check
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "nepal-legal-docs")
PINECONE_TEXT_STORAGE_FILE = DATA_DIR / "pinecone_text_storage.json"  # local chunk-text sidecar


# Retrieval settings
DEFAULT_RETRIEVAL_K = 5  # Number of chunks to retrieve

# LLM settings (Step 4)
MISTRAL_MODEL = "mistral-tiny"  # Options: mistral-tiny, mistral-small, mistral-medium
MISTRAL_API_KEY_ENV_VAR = "MISTRAL_API_KEY"  # env var NAME, not the key itself