ramanna's picture
Upload 30 files
b5a9373 verified
#!/usr/bin/env python3
# scripts/create_eu_ai_act_vectorstore.py
"""
Script to create and save a vectorstore from the EU AI Act PDF.
This creates a FAISS vectorstore that can be loaded quickly in the main app.
"""
import os
import logging
from pathlib import Path
import pickle
from typing import Optional
import dotenv
# Import config
from config import config
# PDF processing
import PyPDF2
# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
# Read variables from a .env file into the process environment
dotenv.load_dotenv()

# The file handler below writes into this directory, so make sure it exists first
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Send log records both to the console and to a log file
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data_updating_scripts/logs/eu_vectorstore.log"),
    ],
)
logger = logging.getLogger(__name__)
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Each page's text is preceded by a ``--- Page N ---`` marker (1-based)
    surrounded by blank lines. A page whose extraction raises is logged as a
    warning and skipped, so one bad page does not abort the whole run.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated page texts, including the page markers.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            logged and then re-raised unchanged.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            logger.info(f"Processing {len(pdf_reader.pages)} pages from {pdf_path}")

            # Collect the per-page pieces and join once at the end instead of
            # growing a string with += on every page.
            parts = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Fall back to "" so a page that yields no text can never
                    # inject the literal string "None" into the output.
                    page_text = page.extract_text() or ""
                except Exception as e:
                    logger.warning(f"Error extracting text from page {page_num}: {e}")
                    continue
                parts.append(f"\n\n--- Page {page_num} ---\n\n{page_text}")

            text = "".join(parts)
            logger.info(f"Extracted {len(text)} characters from PDF")
            return text
    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise
def create_eu_ai_act_documents(
    text_content: str,
    chunk_size: int = 1500,
    chunk_overlap: int = 200,
) -> list:
    """Split the EU AI Act text into chunked Document objects with metadata.

    The whole text is wrapped in one Document carrying the regulation-level
    metadata, then split with a RecursiveCharacterTextSplitter. Every
    resulting chunk additionally gets its index (``chunk_id``) and the total
    number of chunks (``total_chunks``).

    Args:
        text_content: Full text of the regulation.
        chunk_size: Maximum chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        List of Document chunks in splitter order.

    Raises:
        Exception: Any error raised while building or splitting the document
            is logged and then re-raised unchanged.
    """
    try:
        # Separators are tried in order: paragraph, line, sentence, word, character
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

        # One source document holding the metadata every chunk inherits
        doc = Document(
            page_content=text_content,
            metadata={
                'source': 'EU AI Act',
                'document_type': 'regulation',
                'jurisdiction': 'European Union',
                'title': 'Regulation (EU) 2024/1689 on Artificial Intelligence (AI Act)'
            }
        )

        splits = text_splitter.split_documents([doc])

        # Tag each chunk with its position and the overall chunk count
        total_chunks = len(splits)
        for i, split in enumerate(splits):
            split.metadata.update({
                'chunk_id': i,
                'total_chunks': total_chunks
            })

        logger.info(f"Created {total_chunks} document chunks")
        return splits
    except Exception as e:
        logger.error(f"Error creating documents: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise
def create_and_save_eu_vectorstore(
    pdf_path: str = "data_updating_scripts/eu-ai-act.pdf",
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> bool:
    """
    Create a FAISS vectorstore from the EU AI Act PDF and save it locally.

    Steps: extract the PDF text, split it into chunks, embed the chunks with
    OpenAI embeddings, save the FAISS index into ``vectorstore_path`` and
    write a ``metadata.pickle`` file describing the build next to it.

    Args:
        pdf_path: Path to the EU AI Act PDF file
        vectorstore_path: Directory to save the vectorstore
        openai_api_key: OpenAI API key (if not provided, config.OPENAI_API_KEY is used)

    Returns:
        bool: True if successful, False otherwise. Failures are logged and
        never raised to the caller.
    """
    embedding_model = "text-embedding-3-small"
    try:
        # Check if PDF exists
        if not Path(pdf_path).exists():
            logger.error(f"PDF file not found: {pdf_path}")
            return False

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return False

        logger.info("Starting EU AI Act vectorstore creation...")

        # Extract text from PDF
        logger.info("Extracting text from PDF...")
        text_content = extract_text_from_pdf(pdf_path)
        # Fewer than 1000 characters is treated as a failed extraction
        if not text_content or len(text_content) < 1000:
            logger.error("Insufficient text extracted from PDF")
            return False

        # Create documents
        logger.info("Creating document chunks...")
        documents = create_eu_ai_act_documents(text_content)
        if not documents:
            logger.error("No documents created")
            return False

        # Initialize embeddings
        logger.info("Initializing embeddings...")
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model=embedding_model
        )

        # Create vectorstore
        logger.info("Creating FAISS vectorstore...")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # The default target is nested ("data/..."), so missing parent
        # directories must be created too; otherwise mkdir fails after the
        # embeddings have already been computed.
        store_dir = Path(vectorstore_path)
        store_dir.mkdir(parents=True, exist_ok=True)

        # Save vectorstore
        logger.info(f"Saving vectorstore to {vectorstore_path}...")
        vectorstore.save_local(vectorstore_path)

        # Save metadata; chunk_size / chunk_overlap mirror the chunking
        # settings applied in create_eu_ai_act_documents.
        metadata = {
            'pdf_path': pdf_path,
            'total_chunks': len(documents),
            'text_length': len(text_content),
            'embedding_model': embedding_model,
            'chunk_size': 1500,
            'chunk_overlap': 200
        }
        metadata_path = store_dir / "metadata.pickle"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)

        logger.info("✅ EU AI Act vectorstore created successfully!")
        logger.info(f" - Total chunks: {len(documents)}")
        logger.info(f" - Text length: {len(text_content):,} characters")
        logger.info(f" - Saved to: {vectorstore_path}")
        return True
    except Exception as e:
        # The exception is swallowed here, so log the traceback as well;
        # otherwise the cause of a failed build is lost.
        logger.exception(f"Error creating EU AI Act vectorstore: {e}")
        return False
def load_eu_vectorstore(
    vectorstore_path: str = "eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> Optional[FAISS]:
    """
    Load the EU AI Act vectorstore from disk.

    When called with the default ``vectorstore_path`` and that directory does
    not exist, the directory that create_and_save_eu_vectorstore writes to by
    default ("data/eu_ai_act_vectorstore") is tried instead, so a store built
    with this script's defaults can be loaded with this function's defaults.
    Explicitly passed paths are never substituted.

    Args:
        vectorstore_path: Path to the saved vectorstore
        openai_api_key: OpenAI API key (if not provided, config.OPENAI_API_KEY is used)

    Returns:
        FAISS vectorstore or None if failed. Failures are logged and never
        raised to the caller.
    """
    legacy_default = "eu_ai_act_vectorstore"
    saved_default = "data/eu_ai_act_vectorstore"
    try:
        # Bridge the mismatch between this function's default and the
        # location the creation step saves to by default.
        if (
            vectorstore_path == legacy_default
            and not Path(vectorstore_path).exists()
            and Path(saved_default).exists()
        ):
            logger.warning(
                f"Vectorstore not found at {vectorstore_path}; using {saved_default} instead"
            )
            vectorstore_path = saved_default

        if not Path(vectorstore_path).exists():
            logger.error(f"Vectorstore not found: {vectorstore_path}")
            return None

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return None

        # Must be the same embedding model the store was built with
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # SECURITY: loading unpickles data from vectorstore_path, which can
        # execute arbitrary code. Only load stores created by a trusted run
        # of this script, never files from an untrusted source.
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True  # Required for loading pickled objects
        )
        logger.info(f"✅ EU AI Act vectorstore loaded from {vectorstore_path}")
        return vectorstore
    except Exception as e:
        # The exception is swallowed here, so log the traceback as well.
        logger.exception(f"Error loading EU AI Act vectorstore: {e}")
        return None
def get_vectorstore_info(vectorstore_path: str = "data/eu_ai_act_vectorstore") -> dict:
    """Return the build metadata stored next to a saved vectorstore.

    Reads ``metadata.pickle`` from ``vectorstore_path``. This function never
    raises: a missing file or any read/unpickle failure is reported as a
    dict with a single ``"error"`` key.

    Args:
        vectorstore_path: Directory containing the saved vectorstore.

    Returns:
        The unpickled metadata, or ``{"error": <message>}`` on failure.
    """
    try:
        info_file = Path(vectorstore_path) / "metadata.pickle"
        # Guard clause: nothing to read if the metadata file is absent
        if not info_file.exists():
            return {"error": "Metadata not found"}
        # NOTE: pickle is only safe for files written by a trusted run of this script
        with open(info_file, 'rb') as handle:
            return pickle.load(handle)
    except Exception as exc:
        return {"error": str(exc)}
if __name__ == "__main__":
    # Build the vectorstore with the default paths
    success = create_and_save_eu_vectorstore()
    if success:
        # Print the metadata written alongside the store
        info = get_vectorstore_info()
        print("\n" + "="*50)
        print("EU AI Act Vectorstore Information:")
        print("="*50)
        for key, value in info.items():
            # An "error" entry means the metadata could not be read; it is not shown
            if key != 'error':
                print(f"{key}: {value}")
        print("="*50)
    else:
        print("❌ Failed to create EU AI Act vectorstore")
        # SystemExit works without the `site`-provided exit() helper, which
        # is intended for interactive sessions only.
        raise SystemExit(1)