File size: 10,521 Bytes
56e1ad9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""

LexiBot Data Ingestion Script with Context Injection

Processes legal documents and uploads to Pinecone with Act-prefixed chunks.



CRITICAL: This script implements "Context Injection" to solve the section overlap issue

where queries about "Section 3" would confuse sections from different Acts.



Uses Pinecone's built-in embeddings to avoid Google API rate limits.

"""

import os
import re
import time
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv

from langchain.schema import Document
from langchain_pinecone import PineconeEmbeddings, PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# Load variables from a local .env file (if present) before reading them below.
load_dotenv()

# Configuration
# NOTE(review): GOOGLE_API_KEY is read here but not referenced anywhere else in
# this script; it may be consumed by another module - confirm before removing.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# Required: main() raises ValueError when this is unset or empty.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Name of the Pinecone index to create/use; overridable via the environment.
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "lexibot-legal-docs")
# Directory scanned (non-recursively) for "*.txt" source documents.
RAW_DATA_DIR = "./RawData"

# Act name mapping from filename
# Keys are looked up with the bare file name of each file found in
# RAW_DATA_DIR, so they must match those names exactly (spelling included).
# Files without an entry fall back to a name derived from the file name.
ACT_NAME_MAP = {
    "Consumer-Protection-Act.txt": "Consumer Protection Act, 2019",
    "IPC-SECTIONS-FOR-HARRASMENT.txt": "Indian Penal Code (Harassment Sections)",
    "Motor-Vehicles-Act.txt": "Motor Vehicles (Amendment) Act, 2019",
    "The-Proctection-Of-Women-From-Domestic-Violence.txt": "Protection of Women from Domestic Violence Act, 2005",
    "The-Protection-Of-Children-From-Sexual-Offences.txt": "Protection of Children from Sexual Offences Act, 2012",
    "The-sexual-harassment-of-women-at-workplace.txt": "Sexual Harassment of Women at Workplace Act, 2013"
}


def clean_government_headers(text: str) -> str:
    """Remove standardized government document headers and noise.

    Each pattern is applied over the whole text, case-insensitively and with
    ``.`` allowed to span newlines, and every match is deleted.

    Args:
        text: Raw text of one legal document.

    Returns:
        The text with all matched header fragments removed and leading and
        trailing whitespace stripped.
    """
    patterns_to_remove = (
        # Ministry banner, up to the next blank line or numbered item.
        r"MINISTRY OF LAW AND JUSTICE.*?(?=\n\n|\d+\.)",
        r"\(Legislative Department\)",
        # Publication dateline ending in the Saka-calendar date.
        r"New Delhi, the \d+.*?Saka\)",
        # Assent notice. The trailing dash is optional: accept a real em dash
        # (U+2014) or its mis-decoded three-character form, so the notice is
        # removed whether or not the source text was decoded correctly.
        r"The following Act of Parliament.*?information:"
        r"(?:\u2014|\u03b2\u20ac[\u201d\u2014])?",
        # Bracketed date such as "[9th August, 2019.]". The body may not
        # contain another bracket; otherwise, because "." spans newlines, a
        # match could start at an unrelated earlier "[" and delete everything
        # up to the first dated bracket.
        r"\[[^\[\]]*?\d{4}\.\]",
    )

    for pattern in patterns_to_remove:
        text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)

    return text.strip()


def split_by_legal_sections(text: str) -> List[Dict[str, str]]:
    """
    Split a document into legal sections.

    A section starts on any line that begins with digits, optionally followed
    by letters, then a period and whitespace (e.g. "10. ", "215D. ", "354A. ").
    The letter suffix matters: without it, headings such as "215D." or "354A."
    would not be recognised.

    Lines that appear before the first section heading are discarded.

    Returns:
        One dict per section, in document order, with keys "section_number"
        (the heading token without the period) and "content" (the heading
        line plus all following lines up to the next heading, stripped).
    """
    heading = re.compile(r"^(\d+[A-Za-z]*)\.\s+")

    # Each entry is (section_number, lines belonging to that section).
    grouped = []
    for raw_line in text.split("\n"):
        found = heading.match(raw_line)
        if found:
            grouped.append((found.group(1), [raw_line]))
        elif grouped:
            # Continuation of the most recently opened section.
            grouped[-1][1].append(raw_line)

    return [
        {"section_number": number, "content": "\n".join(body).strip()}
        for number, body in grouped
    ]


def create_context_injected_chunks(
    filename: str,
    sections: List[Dict[str, str]],
    max_chunk_size: int = 1500
) -> List[Document]:
    """
    Create LangChain Document objects with Context Injection.

    The Act name and section number are prepended to the text of every chunk
    so that identically numbered sections of different Acts remain
    distinguishable.

    Args:
        filename: Source file name; used to look up the Act name in
            ACT_NAME_MAP. Unknown files fall back to the file name with
            hyphens replaced by spaces and ".txt" removed.
        sections: Dicts with "section_number" and "content" keys.
        max_chunk_size: Maximum length, in characters, of a chunk's
            page_content (header included).

    Returns:
        Documents in section order. A section whose injected text fits in
        max_chunk_size yields one Document; a longer section is cut into
        consecutive fixed-width pieces, each carrying a "(continued)" header.
        Sections with fewer than 50 characters of content are skipped.

    Raises:
        ValueError: If a section has to be split but max_chunk_size leaves no
            room for content after the header.
    """
    act_name = ACT_NAME_MAP.get(filename, filename.replace("-", " ").replace(".txt", ""))
    documents = []

    for section in sections:
        section_num = section["section_number"]
        content = section["content"]

        # Skip very short sections (likely noise)
        if len(content) < 50:
            continue

        # Context Injection: prepend Act and Section info
        injected_content = f"Act: {act_name}\n\nSection: {section_num}\n\nContent: {content}"

        if len(injected_content) <= max_chunk_size:
            pieces = [injected_content]
        else:
            # Split large sections while keeping the context header on every piece.
            header = f"Act: {act_name}\nSection: {section_num} (continued)\nContent: "
            chunk_size = max_chunk_size - len(header)
            if chunk_size <= 0:
                # A zero or negative width would never consume the content,
                # so the split could not terminate.
                raise ValueError(
                    f"max_chunk_size={max_chunk_size} is too small for the "
                    f"{len(header)}-character context header of section "
                    f"{section_num} in {filename}"
                )
            pieces = [
                header + content[start:start + chunk_size]
                for start in range(0, len(content), chunk_size)
            ]

        for chunk_idx, page_content in enumerate(pieces):
            documents.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "source": filename,
                        "act_name": act_name,
                        "section_number": section_num,
                        "chunk_index": chunk_idx,
                        "type": "legal_section"
                    }
                )
            )

    return documents


def process_all_documents() -> List[Document]:
    """Turn every "*.txt" file directly inside RAW_DATA_DIR into chunk Documents.

    Each file is read as UTF-8, stripped of header noise, split into legal
    sections and converted to context-injected chunks. Progress is printed
    per file.

    Returns:
        The chunks of all files, concatenated in the order the files were
        visited.
    """
    collected: List[Document] = []

    for source_path in Path(RAW_DATA_DIR).glob("*.txt"):
        print(f"πŸ“„ Processing: {source_path.name}")

        body = clean_government_headers(source_path.read_text(encoding="utf-8"))

        found_sections = split_by_legal_sections(body)
        print(f"   Found {len(found_sections)} sections")

        # The bare file name selects the Act name inside the chunk builder.
        chunks = create_context_injected_chunks(source_path.name, found_sections)
        print(f"   Created {len(chunks)} chunks")

        collected += chunks

    return collected


def initialize_pinecone(dimension: int = 1024):
    """Initialize Pinecone client and create index if needed.

    Args:
        dimension: Vector dimension used when the index has to be created.
            Defaults to 1024, the size main() uses for the
            multilingual-e5-large embeddings. Ignored when the index already
            exists.

    Returns:
        A handle to the index named PINECONE_INDEX_NAME.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Check if index exists
    existing_indexes = [idx.name for idx in pc.list_indexes()]

    if PINECONE_INDEX_NAME not in existing_indexes:
        print(f"πŸ”§ Creating Pinecone index: {PINECONE_INDEX_NAME}")
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            # Must equal the embedding size; a mismatched index would reject
            # the vectors this script uploads.
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    else:
        print(f"βœ… Using existing Pinecone index: {PINECONE_INDEX_NAME}")

    return pc.Index(PINECONE_INDEX_NAME)


def main():
    """Run the ingestion pipeline end to end.

    Steps: check that the Pinecone API key is configured, build chunk
    Documents from the raw files, make sure the target index exists (creating
    it with dimension 1024 if not), then embed and upload the chunks in
    batches.

    Raises:
        ValueError: If PINECONE_API_KEY is not set.
    """
    print("πŸš€ LexiBot Data Ingestion with Context Injection")
    print("=" * 50)

    # Fail fast before doing any document processing.
    if not PINECONE_API_KEY:
        raise ValueError("PINECONE_API_KEY not set in environment")

    documents = process_all_documents()
    print(f"\nπŸ“š Total documents to ingest: {len(documents)}")

    if not documents:
        print("❌ No documents found. Check RawData directory.")
        return

    # Embeddings are produced by Pinecone's hosted model.
    print("\nπŸ”— Initializing Pinecone Embeddings (multilingual-e5-large)...")
    embeddings = PineconeEmbeddings(
        model="multilingual-e5-large",
        pinecone_api_key=PINECONE_API_KEY
    )

    print("🌲 Initializing Pinecone...")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    known_index_names = [idx.name for idx in pc.list_indexes()]

    if PINECONE_INDEX_NAME in known_index_names:
        print(f"βœ… Using existing Pinecone index: {PINECONE_INDEX_NAME}")
    else:
        print(f"πŸ”§ Creating Pinecone index: {PINECONE_INDEX_NAME}")
        serverless = ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            dimension=1024,  # multilingual-e5-large dimension
            metric="cosine",
            spec=serverless
        )
        # Fixed pause to give the new index time to come up.
        print("   ⏳ Waiting for index to be ready...")
        time.sleep(10)

    print("\n⬆️  Uploading to Pinecone Vector Store...")

    BATCH_SIZE = 100
    whole_batches, leftover = divmod(len(documents), BATCH_SIZE)
    total_batches = whole_batches + (1 if leftover else 0)

    vectorstore = None
    batch_starts = range(0, len(documents), BATCH_SIZE)
    for batch_num, start in enumerate(batch_starts, start=1):
        batch = documents[start:start + BATCH_SIZE]
        print(f"   πŸ“¦ Uploading batch {batch_num}/{total_batches} ({len(batch)} documents)...")

        if vectorstore is None:
            # The first batch also constructs the vector store object.
            vectorstore = PineconeVectorStore.from_documents(
                documents=batch,
                embedding=embeddings,
                index_name=PINECONE_INDEX_NAME
            )
        else:
            # Later batches are appended through the same store.
            vectorstore.add_documents(batch)

    print("\nβœ… Ingestion Complete!")
    print(f"   Index: {PINECONE_INDEX_NAME}")
    print(f"   Documents: {len(documents)}")
    print("\nπŸ’‘ You can now start the API with: uvicorn app:app --reload --port 7860")

# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()