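"""Offline ingestion script.

Loads PDF files, merges each PDF's pages while recording per-page character
ranges, splits the merged text into token-sized chunks (counted with tiktoken),
embeds the chunks with a fine-tuned Arctic embedding model, and pickles both
the chunks and their embeddings under data/processed_data/ for later retrieval.
"""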
import os
from collections import defaultdict
import tiktoken
import pickle
import shutil
import json
import time
import numpy as np
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def tiktoken_len(text):
    """Count tokens using the gpt-4o-mini tokenizer"""
    tokens = tiktoken.encoding_for_model("gpt-4o-mini").encode(text)
    return len(tokens)
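
# The merged documents built in process_pdfs() carry metadata["page_ranges"]:
# a list of dicts {"start": int, "end": int, "page": int, "source": str}
# mapping character offsets in the merged text back to the original PDF pages.
# The helper below uses that mapping to tag each chunk with its page number.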
def add_page_info_to_splits(splits):
    """Process splits to add page info based on character position"""
    for split in splits:
        # Get the start position of this chunk
        start_pos = split.metadata.get("start_index", 0)
        end_pos = start_pos + len(split.page_content)

        # Find which page this chunk belongs to
        if "page_ranges" in split.metadata:
            for page_range in split.metadata["page_ranges"]:
                # If the chunk overlaps with this page range
                if (start_pos <= page_range["end"] and
                        end_pos >= page_range["start"]):
                    # Use this page number (first overlapping page wins)
                    split.metadata["page"] = page_range["page"]
                    break
    return splits

def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories"""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

class ArcticEmbedder:
    """Wraps a Hugging Face encoder: mean-pools the last hidden state over the
    attention mask and L2-normalizes the result to produce sentence embeddings."""

    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def encode(self, texts, batch_size=32):
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encoded_input = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            all_embeddings.append(batch_embeddings.cpu().numpy())
        return np.concatenate(all_embeddings)
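
# Example usage (illustrative; the query string is hypothetical):
#   embedder = ArcticEmbedder("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
#   vectors = embedder.encode(["How long should an A/B test run?"])  # shape: (1, hidden_size)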

def process_pdfs():
    """Load PDFs, chunk and embed them, and save the results to disk"""
| print("Processing PDFs...") | |
| # Create processed data directory if it doesn't exist (clean it if it does) | |
| processed_data_dir = Path("data/processed_data") | |
| clean_directory(processed_data_dir) | |
| # Load all PDF documents (each page as a separate document) | |
| pdf_path = "notebook_version_clean/data/" | |
| print(f"Loading PDFs from: {pdf_path}") | |
| loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader) | |
| all_docs = loader.load() | |
| print(f"Loaded {len(all_docs)} document pages.") | |

    # Group the per-page documents by their source file so each PDF can be
    # merged back together while tracking page ranges
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        source = doc.metadata.get("source", "")
        docs_by_source[source].append(doc)

    # Merge pages from the same PDF but track page ranges
    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Sort by page number if available
        source_docs.sort(key=lambda x: x.metadata.get("page", 0))

        # Get just the filename (no path)
        filename = os.path.basename(source)

        # Merge the content
        merged_content = ""
        page_ranges = []

        for doc in source_docs:
            # Get the page number (1-indexed for human readability)
            page_num = doc.metadata.get("page", 0) + 1

            # Add a separator between pages for clarity
            if merged_content:
                merged_content += "\n\n"

            # Record where this page's content starts in the merged document
            start_pos = len(merged_content)
            merged_content += doc.page_content
            end_pos = len(merged_content)

            # Store the mapping of character ranges to original page numbers
            page_ranges.append({
                "start": start_pos,
                "end": end_pos,
                "page": page_num,
                "source": filename
            })

        # Create merged metadata that includes page mapping information
        merged_metadata = {
            "source": filename,
            "title": filename,
            "page_count": len(source_docs),
            "merged": True,
            "page_ranges": page_ranges  # Store the page ranges for later reference
        }

        # Create a new document with the merged content
        merged_doc = Document(page_content=merged_content, metadata=merged_metadata)
        merged_docs.append(merged_doc)

    print(f"Created {len(merged_docs)} merged documents.")

    # Split the merged documents into small, heavily overlapping chunks
    # (150 tokens per chunk with a 100-token overlap, measured with tiktoken)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=150,
        chunk_overlap=100,
        length_function=tiktoken_len,
        add_start_index=True
    )

    # Split and then process to add page information
    raw_splits = text_splitter.split_documents(merged_docs)
    split_chunks = add_page_info_to_splits(raw_splits)
    print(f"Created {len(split_chunks)} chunks.")

    # Save chunks for later use
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(split_chunks, f)

    # Initialize custom embedding model
    try:
        embedding_model = ArcticEmbedder("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
        print("Successfully loaded ArcticEmbedder model")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise RuntimeError(f"Error initializing ArcticEmbedder model: {str(e)}") from e
| print("Embedding document chunks (this may take a while)...") | |
| # Create a dictionary to store documents and their embeddings | |
| embedded_docs = [] | |
| # Embed in batches to avoid API rate limits | |
| batch_size = 50 | |
    for i in range(0, len(split_chunks), batch_size):
        batch = split_chunks[i:i+batch_size]

        # Extract text
        texts = [doc.page_content for doc in batch]

        # Get embeddings
        embeddings = embedding_model.encode(texts)

        # Store with metadata
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j]
            })

        # Print progress
        print(f"Embedded {min(i+batch_size, len(split_chunks))}/{len(split_chunks)} chunks")

    # Save the embedded docs for later use
    with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
        pickle.dump(embedded_docs, f)

    print("Processing complete. All data saved to data/processed_data/")
| if __name__ == "__main__": | |
| process_pdfs() |