#!/usr/bin/env python3
# scripts/create_eu_ai_act_vectorstore.py
"""
Build and persist a FAISS vectorstore from the EU AI Act PDF.

The saved index lets the main application load embeddings quickly
instead of re-processing the PDF on every start.
"""
import logging
import os
import pickle
from pathlib import Path
from typing import Optional

import dotenv

# Project configuration (provides OPENAI_API_KEY).
from config import config

# PDF processing
import PyPDF2

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# Pull environment variables (e.g. OPENAI_API_KEY) from a .env file.
dotenv.load_dotenv()

# The log directory must exist before the FileHandler is attached.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Log to both the console and a persistent file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data_updating_scripts/logs/eu_vectorstore.log"),
    ],
)
logger = logging.getLogger(__name__)
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract plain text from every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated page texts, each prefixed with a
        ``--- Page N ---`` marker.

    Raises:
        Exception: Propagates any error raised while opening or
            parsing the PDF (per-page extraction errors are only logged).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            logger.info(f"Processing {len(pdf_reader.pages)} pages from {pdf_path}")
            # Collect page texts in a list and join once at the end:
            # avoids quadratic cost of repeated string concatenation.
            parts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    parts.append(f"\n\n--- Page {page_num + 1} ---\n\n{page_text}")
                except Exception as e:
                    # Skip unreadable pages rather than failing the whole run.
                    logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                    continue
            text = "".join(parts)
            logger.info(f"Extracted {len(text)} characters from PDF")
            return text
    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
        # Bare raise preserves the original traceback.
        raise
def create_eu_ai_act_documents(text_content: str) -> list:
    """Split the EU AI Act text into chunked ``Document`` objects.

    Args:
        text_content: Full extracted text of the EU AI Act.

    Returns:
        list: ``Document`` chunks carrying regulation metadata plus a
        per-chunk ``chunk_id`` and the ``total_chunks`` count.

    Raises:
        Exception: Propagates any splitter or Document construction error.
    """
    try:
        # Larger chunks with generous overlap suit dense legal prose,
        # where context frequently spans paragraph boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        # Wrap the full text in a single Document so every chunk
        # inherits the source/jurisdiction metadata below.
        doc = Document(
            page_content=text_content,
            metadata={
                'source': 'EU AI Act',
                'document_type': 'regulation',
                'jurisdiction': 'European Union',
                'title': 'Regulation (EU) 2024/1689 on Artificial Intelligence (AI Act)'
            }
        )
        splits = text_splitter.split_documents([doc])
        # Tag each chunk with its position so provenance survives retrieval.
        for i, split in enumerate(splits):
            split.metadata.update({
                'chunk_id': i,
                'total_chunks': len(splits)
            })
        logger.info(f"Created {len(splits)} document chunks")
        return splits
    except Exception as e:
        logger.error(f"Error creating documents: {e}")
        # Bare raise preserves the original traceback.
        raise
def create_and_save_eu_vectorstore(
    pdf_path: str = "data_updating_scripts/eu-ai-act.pdf",
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> bool:
    """
    Create a FAISS vectorstore from the EU AI Act PDF and save it locally.

    Alongside the FAISS index, a ``metadata.pickle`` file with build
    parameters is written into the vectorstore directory.

    Args:
        pdf_path: Path to the EU AI Act PDF file.
        vectorstore_path: Directory to save the vectorstore into.
        openai_api_key: OpenAI API key (falls back to ``config.OPENAI_API_KEY``).

    Returns:
        bool: True if successful, False otherwise (errors are logged,
        never raised).
    """
    try:
        # Check if the source PDF exists before doing any work.
        if not Path(pdf_path).exists():
            logger.error(f"PDF file not found: {pdf_path}")
            return False

        # Resolve the API key: explicit argument wins over configuration.
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return False

        logger.info("Starting EU AI Act vectorstore creation...")

        # Extract text from the PDF.
        logger.info("Extracting text from PDF...")
        text_content = extract_text_from_pdf(pdf_path)
        # A real extraction of the AI Act yields far more than 1000 chars;
        # anything less signals a scanned/unreadable PDF.
        if not text_content or len(text_content) < 1000:
            logger.error("Insufficient text extracted from PDF")
            return False

        # Split into metadata-tagged chunks.
        logger.info("Creating document chunks...")
        documents = create_eu_ai_act_documents(text_content)
        if not documents:
            logger.error("No documents created")
            return False

        # Initialize the embedding model.
        logger.info("Initializing embeddings...")
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # Embed all chunks and build the FAISS index.
        logger.info("Creating FAISS vectorstore...")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # parents=True: the default path "data/..." needs its parent
        # directory created too; exist_ok keeps reruns idempotent.
        Path(vectorstore_path).mkdir(parents=True, exist_ok=True)

        # Persist the index.
        logger.info(f"Saving vectorstore to {vectorstore_path}...")
        vectorstore.save_local(vectorstore_path)

        # Record build parameters so get_vectorstore_info() can report them.
        metadata = {
            'pdf_path': pdf_path,
            'total_chunks': len(documents),
            'text_length': len(text_content),
            'embedding_model': 'text-embedding-3-small',
            'chunk_size': 1500,
            'chunk_overlap': 200
        }
        metadata_path = Path(vectorstore_path) / "metadata.pickle"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)

        logger.info(f"✅ EU AI Act vectorstore created successfully!")
        logger.info(f"   - Total chunks: {len(documents)}")
        logger.info(f"   - Text length: {len(text_content):,} characters")
        logger.info(f"   - Saved to: {vectorstore_path}")
        return True
    except Exception as e:
        logger.error(f"Error creating EU AI Act vectorstore: {e}")
        return False
def load_eu_vectorstore(
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> Optional[FAISS]:
    """
    Load the EU AI Act vectorstore from disk.

    The default path matches where ``create_and_save_eu_vectorstore``
    writes the store (previously the defaults disagreed, so a fresh
    build could not be found by the loader).

    Args:
        vectorstore_path: Path to the saved vectorstore directory.
        openai_api_key: OpenAI API key (falls back to ``config.OPENAI_API_KEY``).

    Returns:
        FAISS vectorstore, or None if loading failed (errors are logged).
    """
    try:
        if not Path(vectorstore_path).exists():
            logger.error(f"Vectorstore not found: {vectorstore_path}")
            return None

        # Resolve the API key: explicit argument wins over configuration.
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return None

        # Embeddings must match the model used at build time.
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # FAISS stores are pickled; safe here because we only load
        # files this script wrote itself — never untrusted input.
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info(f"✅ EU AI Act vectorstore loaded from {vectorstore_path}")
        return vectorstore
    except Exception as e:
        logger.error(f"Error loading EU AI Act vectorstore: {e}")
        return None
def get_vectorstore_info(vectorstore_path: str = "data/eu_ai_act_vectorstore") -> dict:
    """Return the build metadata recorded alongside the saved vectorstore.

    Reads ``metadata.pickle`` from the vectorstore directory. On any
    failure a dict with an ``"error"`` key is returned instead of raising.
    """
    metadata_file = Path(vectorstore_path) / "metadata.pickle"
    try:
        if not metadata_file.exists():
            return {"error": "Metadata not found"}
        with open(metadata_file, 'rb') as fh:
            return pickle.load(fh)
    except Exception as exc:
        return {"error": str(exc)}
if __name__ == "__main__":
    # Build the vectorstore, then print a summary — or fail loudly.
    if create_and_save_eu_vectorstore():
        info = get_vectorstore_info()
        divider = "=" * 50
        print("\n" + divider)
        print("EU AI Act Vectorstore Information:")
        print(divider)
        for key, value in info.items():
            if key != 'error':
                print(f"{key}: {value}")
        print(divider)
    else:
        print("❌ Failed to create EU AI Act vectorstore")
        exit(1)