Spaces:
Build error
Build error
"""Build a FAISS index over the PDFs in the local knowledge base."""
import glob
import os
import pickle
import warnings

import faiss
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from together import Together

warnings.filterwarnings("ignore")

# SECURITY FIX: the Together API key was previously hardcoded in source
# (a leaked credential). Read it from the environment instead; the old
# key committed here should be revoked.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "")
client = Together(api_key=TOGETHER_API_KEY)

# Embedding model used to vectorize document chunks.
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
)
def extract_text_from_pdf(pdf_path):
    """Extract all page text from a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated, stripped text of all pages, or "" if the file
        cannot be read or parsed.
    """
    try:
        reader = PdfReader(pdf_path)
        pages = []
        for page in reader.pages:
            # BUG FIX: extract_text() may return None (e.g. image-only
            # pages); the previous `text += page.extract_text() + "\n"`
            # raised TypeError and dropped the entire document.
            pages.append(page.extract_text() or "")
        return "\n".join(pages).strip()
    except Exception as e:
        # Best-effort: report and skip unreadable PDFs rather than abort.
        print(f"Error processing {pdf_path}: {str(e)}")
        return ""
def create_index():
    """Create and persist the FAISS index plus chunk metadata.

    Reads every PDF under knowledge_base/, splits extracted text into
    ~1000-character chunks, embeds them with the module-level model, and
    writes two artifacts:
      - knowledge_base/faiss_index.bin  (inner-product FAISS index over
        L2-normalized vectors, i.e. cosine similarity)
      - knowledge_base/metadata.pkl     (parallel lists of chunk texts
        and their source-file labels)

    Raises:
        ValueError: if no PDFs are found or no text could be extracted.
    """
    # Create knowledge_base directory if it doesn't exist.
    os.makedirs("knowledge_base", exist_ok=True)

    # BUG FIX: the glob previously read "Knowledge_base" (capital K),
    # which never matches the "knowledge_base" directory created above
    # on case-sensitive filesystems. Use one consistent lowercase path.
    pdf_files = glob.glob("knowledge_base/*.pdf")
    if not pdf_files:
        raise ValueError("No PDF files found in knowledge_base directory!")

    print(f"Found {len(pdf_files)} PDF files. Processing...")

    # Process documents into parallel chunk / label lists.
    documents = []
    filenames = []
    for pdf_path in pdf_files:
        filename = os.path.basename(pdf_path)
        content = extract_text_from_pdf(pdf_path)
        if content:
            # Split content into fixed-size chunks (~1000 characters each).
            chunks = [content[i:i + 1000] for i in range(0, len(content), 1000)]
            for i, chunk in enumerate(chunks):
                if chunk.strip():
                    documents.append(chunk)
                    # BUG FIX: the label previously hardcoded "(unknown)"
                    # and left the computed `filename` unused; record the
                    # actual source file for each chunk.
                    filenames.append(f"{filename} (chunk {i+1})")

    if not documents:
        raise ValueError("No valid content extracted from PDFs!")

    print(f"Successfully processed {len(documents)} chunks from {len(pdf_files)} PDFs")

    # Create embeddings for every chunk.
    print("Creating embeddings...")
    embeddings = embedding_model.encode(documents)

    # Inner-product index over L2-normalized vectors == cosine similarity.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Persist the index and the parallel chunk metadata.
    print("Saving index and metadata...")
    faiss.write_index(index, "knowledge_base/faiss_index.bin")
    metadata = {
        "documents": documents,
        "filenames": filenames,
    }
    with open("knowledge_base/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("Index and metadata saved successfully!")


if __name__ == "__main__":
    create_index()