#!/usr/bin/env python3
# scripts/create_eu_ai_act_vectorstore.py
"""
Build and persist a FAISS vectorstore from the EU AI Act PDF.

The saved index lets the main application load embeddings quickly
instead of re-processing the PDF on every start.
"""
import logging
import os
import pickle
from pathlib import Path
from typing import Optional

import dotenv

# Project configuration (provides OPENAI_API_KEY).
from config import config

# PDF processing
import PyPDF2

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# Pull environment variables (e.g. OPENAI_API_KEY) from a .env file.
dotenv.load_dotenv()

# The log directory must exist before the FileHandler is attached.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Log to both the console and a persistent file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data_updating_scripts/logs/eu_vectorstore.log"),
    ],
)
logger = logging.getLogger(__name__)
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract plain text from every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated page texts, each prefixed with a
        ``--- Page N ---`` marker.

    Raises:
        Exception: Propagates any error raised while opening or
            parsing the PDF (per-page extraction errors are only logged).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            logger.info(f"Processing {len(pdf_reader.pages)} pages from {pdf_path}")
            # Collect page texts in a list and join once at the end:
            # avoids quadratic cost of repeated string concatenation.
            parts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    parts.append(f"\n\n--- Page {page_num + 1} ---\n\n{page_text}")
                except Exception as e:
                    # Skip unreadable pages rather than failing the whole run.
                    logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                    continue
            text = "".join(parts)
            logger.info(f"Extracted {len(text)} characters from PDF")
            return text
    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
        # Bare raise preserves the original traceback.
        raise
def create_eu_ai_act_documents(text_content: str) -> list:
    """Split the EU AI Act text into chunked ``Document`` objects.

    Args:
        text_content: Full extracted text of the EU AI Act.

    Returns:
        list: ``Document`` chunks carrying regulation metadata plus a
        per-chunk ``chunk_id`` and the ``total_chunks`` count.

    Raises:
        Exception: Propagates any splitter or Document construction error.
    """
    try:
        # Larger chunks with generous overlap suit dense legal prose,
        # where context frequently spans paragraph boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        # Wrap the full text in a single Document so every chunk
        # inherits the source/jurisdiction metadata below.
        doc = Document(
            page_content=text_content,
            metadata={
                'source': 'EU AI Act',
                'document_type': 'regulation',
                'jurisdiction': 'European Union',
                'title': 'Regulation (EU) 2024/1689 on Artificial Intelligence (AI Act)'
            }
        )
        splits = text_splitter.split_documents([doc])
        # Tag each chunk with its position so provenance survives retrieval.
        for i, split in enumerate(splits):
            split.metadata.update({
                'chunk_id': i,
                'total_chunks': len(splits)
            })
        logger.info(f"Created {len(splits)} document chunks")
        return splits
    except Exception as e:
        logger.error(f"Error creating documents: {e}")
        # Bare raise preserves the original traceback.
        raise
def create_and_save_eu_vectorstore(
    pdf_path: str = "data_updating_scripts/eu-ai-act.pdf",
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> bool:
    """
    Create a FAISS vectorstore from the EU AI Act PDF and save it locally.

    Alongside the FAISS index, a ``metadata.pickle`` file with build
    parameters is written into the vectorstore directory.

    Args:
        pdf_path: Path to the EU AI Act PDF file.
        vectorstore_path: Directory to save the vectorstore into.
        openai_api_key: OpenAI API key (falls back to ``config.OPENAI_API_KEY``).

    Returns:
        bool: True if successful, False otherwise (errors are logged,
        never raised).
    """
    try:
        # Check if the source PDF exists before doing any work.
        if not Path(pdf_path).exists():
            logger.error(f"PDF file not found: {pdf_path}")
            return False

        # Resolve the API key: explicit argument wins over configuration.
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return False

        logger.info("Starting EU AI Act vectorstore creation...")

        # Extract text from the PDF.
        logger.info("Extracting text from PDF...")
        text_content = extract_text_from_pdf(pdf_path)
        # A real extraction of the AI Act yields far more than 1000 chars;
        # anything less signals a scanned/unreadable PDF.
        if not text_content or len(text_content) < 1000:
            logger.error("Insufficient text extracted from PDF")
            return False

        # Split into metadata-tagged chunks.
        logger.info("Creating document chunks...")
        documents = create_eu_ai_act_documents(text_content)
        if not documents:
            logger.error("No documents created")
            return False

        # Initialize the embedding model.
        logger.info("Initializing embeddings...")
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # Embed all chunks and build the FAISS index.
        logger.info("Creating FAISS vectorstore...")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # parents=True: the default path "data/..." needs its parent
        # directory created too; exist_ok keeps reruns idempotent.
        Path(vectorstore_path).mkdir(parents=True, exist_ok=True)

        # Persist the index.
        logger.info(f"Saving vectorstore to {vectorstore_path}...")
        vectorstore.save_local(vectorstore_path)

        # Record build parameters so get_vectorstore_info() can report them.
        metadata = {
            'pdf_path': pdf_path,
            'total_chunks': len(documents),
            'text_length': len(text_content),
            'embedding_model': 'text-embedding-3-small',
            'chunk_size': 1500,
            'chunk_overlap': 200
        }
        metadata_path = Path(vectorstore_path) / "metadata.pickle"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)

        logger.info(f"✅ EU AI Act vectorstore created successfully!")
        logger.info(f"   - Total chunks: {len(documents)}")
        logger.info(f"   - Text length: {len(text_content):,} characters")
        logger.info(f"   - Saved to: {vectorstore_path}")
        return True
    except Exception as e:
        logger.error(f"Error creating EU AI Act vectorstore: {e}")
        return False
def load_eu_vectorstore(
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> Optional[FAISS]:
    """
    Load the EU AI Act vectorstore from disk.

    The default path matches where ``create_and_save_eu_vectorstore``
    writes the store (previously the defaults disagreed, so a fresh
    build could not be found by the loader).

    Args:
        vectorstore_path: Path to the saved vectorstore directory.
        openai_api_key: OpenAI API key (falls back to ``config.OPENAI_API_KEY``).

    Returns:
        FAISS vectorstore, or None if loading failed (errors are logged).
    """
    try:
        if not Path(vectorstore_path).exists():
            logger.error(f"Vectorstore not found: {vectorstore_path}")
            return None

        # Resolve the API key: explicit argument wins over configuration.
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return None

        # Embeddings must match the model used at build time.
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # FAISS stores are pickled; safe here because we only load
        # files this script wrote itself — never untrusted input.
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info(f"✅ EU AI Act vectorstore loaded from {vectorstore_path}")
        return vectorstore
    except Exception as e:
        logger.error(f"Error loading EU AI Act vectorstore: {e}")
        return None
def get_vectorstore_info(vectorstore_path: str = "data/eu_ai_act_vectorstore") -> dict:
    """Return the build metadata recorded alongside the saved vectorstore.

    Reads ``metadata.pickle`` from the vectorstore directory. On any
    failure a dict with an ``"error"`` key is returned instead of raising.
    """
    metadata_file = Path(vectorstore_path) / "metadata.pickle"
    try:
        if not metadata_file.exists():
            return {"error": "Metadata not found"}
        with open(metadata_file, 'rb') as fh:
            return pickle.load(fh)
    except Exception as exc:
        return {"error": str(exc)}
if __name__ == "__main__":
    # Build the vectorstore, then print a summary — or fail loudly.
    if create_and_save_eu_vectorstore():
        info = get_vectorstore_info()
        divider = "=" * 50
        print("\n" + divider)
        print("EU AI Act Vectorstore Information:")
        print(divider)
        for key, value in info.items():
            if key != 'error':
                print(f"{key}: {value}")
        print(divider)
    else:
        print("❌ Failed to create EU AI Act vectorstore")
        exit(1)