# NyayLens-API / src /indexing /create_sqlite_index.py
# Sai Pranav Reddy
# Clean lightweight deployment
# 968e24d
# """Create SQLite index for fast paragraph lookup"""
# import sqlite3
# import json
# from pathlib import Path
# from tqdm import tqdm
# def create_sqlite_index():
# print("Creating SQLite index...")
# db_path = Path("data/processed/indexed/paragraphs.db")
# db_path.parent.mkdir(parents=True, exist_ok=True)
# # Create database
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()
# # Create table
# cursor.execute("""
# CREATE TABLE IF NOT EXISTS paragraphs (
# id TEXT PRIMARY KEY,
# judgment_id TEXT,
# page_no INTEGER,
# text TEXT,
# char_count INTEGER,
# word_count INTEGER
# )
# """)
# cursor.execute("CREATE INDEX IF NOT EXISTS idx_judgment ON paragraphs(judgment_id)")
# # Load data
# index_file = Path("data/processed/indexed/paragraph_index.jsonl")
# with open(index_file, 'r', encoding='utf-8') as f:
# total = sum(1 for _ in f)
# with open(index_file, 'r', encoding='utf-8') as f:
# batch = []
# for line in tqdm(f, total=total, desc="Inserting"):
# p = json.loads(line)
# batch.append((
# p['id'], p['judgment_id'], p['page_no'],
# p['text'], p['char_count'], p['word_count']
# ))
# if len(batch) >= 1000:
# cursor.executemany(
# "INSERT OR REPLACE INTO paragraphs VALUES (?,?,?,?,?,?)",
# batch
# )
# batch = []
# if batch:
# cursor.executemany(
# "INSERT OR REPLACE INTO paragraphs VALUES (?,?,?,?,?,?)",
# batch
# )
# conn.commit()
# conn.close()
# print(f"βœ“ SQLite index created: {db_path}")
# if __name__ == "__main__":
# create_sqlite_index()
"""
Create the SQLite paragraph index (with section annotations) and its FTS5 full-text search table.
Source: data/processed/indexed/paragraph_index_with_sections.jsonl
"""
import sqlite3
import json
from pathlib import Path
from tqdm import tqdm
# JSONL input read by create_sqlite_index(): one JSON paragraph record per line.
INPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")
# SQLite database file that create_sqlite_index() rebuilds from INPUT_INDEX.
DB_PATH = Path("data/processed/indexed/paragraphs.db")
def _count_records(index_path):
    """Return the number of non-blank lines (records) in the JSONL file."""
    with open(index_path, "r", encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())


def _paragraph_row(record):
    """Convert one parsed JSONL record into a row tuple for ``paragraphs``."""
    text = record["text"]
    return (
        record["id"],
        record["judgment_id"],
        record.get("page_no", -1),
        text,
        record.get("char_count", len(text)),
        record.get("word_count", len(text.split())),
        record.get("section", "unknown"),
        record.get("section_conf", 0.0),
    )


def _insert_batch(cursor, rows):
    """Insert rows into ``paragraphs`` and mirror (id, text) into ``paragraphs_fts``."""
    cursor.executemany(
        """
        INSERT INTO paragraphs
        (id, judgment_id, page_no, text, char_count, word_count, section, section_conf)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    # Column 0 is the paragraph id, column 3 is its text (see _paragraph_row).
    cursor.executemany(
        "INSERT INTO paragraphs_fts (id, text) VALUES (?, ?)",
        [(row[0], row[3]) for row in rows],
    )


def create_sqlite_index():
    """Rebuild the SQLite paragraph index and its FTS5 table from INPUT_INDEX.

    Drops and recreates the ``paragraphs`` table and the ``paragraphs_fts``
    FTS5 virtual table in DB_PATH, then loads every non-blank line of
    INPUT_INDEX (one JSON object per line) in batches. Records must contain
    ``id``, ``judgment_id`` and ``text``; the remaining columns fall back to
    defaults (``page_no`` -1, counts derived from ``text``, ``section``
    "unknown", ``section_conf`` 0.0).

    Raises:
        FileNotFoundError: INPUT_INDEX does not exist. This is raised before
            the database is touched, so an existing index is left intact.
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a record lacks ``id``, ``judgment_id`` or ``text``.
        sqlite3.IntegrityError: two records share the same ``id``.
    """
    print("=" * 70)
    print("NyayLens – Creating SQLite Index (with Sections)")
    print("=" * 70)

    # Read the input BEFORE dropping anything: if the file is missing or
    # unreadable we fail here and the existing index is not destroyed.
    total = _count_records(INPUT_INDEX)

    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Derived data, so it is rebuilt from scratch on every run.
        cursor.execute("DROP TABLE IF EXISTS paragraphs")
        cursor.execute("""
            CREATE TABLE paragraphs (
                id TEXT PRIMARY KEY,
                judgment_id TEXT,
                page_no INTEGER,
                text TEXT,
                char_count INTEGER,
                word_count INTEGER,
                section TEXT,
                section_conf REAL
            )
        """)

        # FTS5 virtual table for full-text search; id is stored but not indexed.
        cursor.execute("DROP TABLE IF EXISTS paragraphs_fts")
        cursor.execute("""
            CREATE VIRTUAL TABLE paragraphs_fts USING fts5(
                id UNINDEXED,
                text,
                tokenize='porter unicode61'
            )
        """)
        conn.commit()

        print(f"βœ“ Inserting {total:,} paragraphs")

        BATCH_SIZE = 1000
        batch = []
        with open(INPUT_INDEX, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            # in json.loads; the same filter is used for the progress total.
            non_blank = (line for line in f if line.strip())
            for line in tqdm(non_blank, total=total, desc="Inserting"):
                batch.append(_paragraph_row(json.loads(line)))
                if len(batch) >= BATCH_SIZE:
                    _insert_batch(cursor, batch)
                    batch.clear()

        # Flush the final partial batch.
        if batch:
            _insert_batch(cursor, batch)

        # Build the lookup indexes after the bulk load so they are created
        # once over the full table rather than updated on every inserted row.
        cursor.execute("CREATE INDEX idx_judgment_id ON paragraphs(judgment_id)")
        cursor.execute("CREATE INDEX idx_section ON paragraphs(section)")
        cursor.execute("CREATE INDEX idx_judgment_section ON paragraphs(judgment_id, section)")

        conn.commit()
    finally:
        # Always release the database handle, including on a failed load.
        conn.close()

    print("\nβœ“ SQLite index created successfully")
    print(f"βœ“ Database path: {DB_PATH}")
    print("=" * 70)
# Build the index only when this file is run as a script, not when imported.
if __name__ == "__main__":
    create_sqlite_index()