# NOTE(review): removed Hugging Face Spaces page header ("Spaces: Sleeping")
# that was captured by the scrape — it is not Python and broke parsing.
from helper import (
    extract_text_from_pdf,
    chunk_text,
    embedding_function,
    embedding_model,
    generate_hypothetical_answer,
    query_llm_with_context,
)
import numpy as np
import faiss
import pickle
import os
import logging

logging.basicConfig(level=logging.INFO)

# Paths for persisting the FAISS index and the pickled document chunks.
index_path = "./faiss_index"
chunks_path = "./document_chunks.pkl"
# Raw string: the original plain literal relied on \G, \A, \I not being
# escape sequences, which emits a SyntaxWarning on modern Python and
# corrupts the path silently if a segment ever starts with t, n, etc.
pdf_path = r'C:\Git Projects\AnnualReport_rag\IBM.pdf'

print('Extracting text from pdf...')
pdf_text = extract_text_from_pdf(pdf_path)

print('Chunking pdf...')
chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

print('Embedding chunks...')
embeddings = embedding_function(chunks)
print(f"Embeddings type: {type(embeddings)}")
print(f"First embedding type: {type(embeddings[0])}")
print(f"First embedding shape or length: {len(embeddings[0]) if hasattr(embeddings[0], '__len__') else 'unknown'}")

# FAISS requires a contiguous float32 matrix; convert if the embedding
# helper returned a plain list (or a list of lists).
if not isinstance(embeddings, np.ndarray):
    print("Converting embeddings to numpy array...")
    embeddings = np.array(embeddings).astype('float32')

# Vector dimension drives the index geometry.
dimension = embeddings.shape[1]
print(f"Embedding dimension: {dimension}")

print('Initializing FAISS index...')
index = faiss.IndexFlatL2(dimension)  # exact L2 (Euclidean) nearest-neighbor search

print('Adding vectors to FAISS index...')
index.add(embeddings)

print('Saving FAISS index...')
faiss.write_index(index, index_path)

# Persist the raw chunks so row i of the index maps back to chunks[i].
print('Saving document chunks...')
with open(chunks_path, 'wb') as f:
    pickle.dump(chunks, f)

print(f"Total vectors in index: {index.ntotal}")
def retrieve_documents(query, n_results=5):
    """Return the top *n_results* chunks most similar to *query*.

    Uses the module-level ``embedding_model``, ``index`` (FAISS IndexFlatL2)
    and ``chunks`` built above.

    Args:
        query: Free-text query string.
        n_results: Number of chunks to retrieve (default 5).

    Returns:
        Tuple ``(documents, similarity_scores)`` where ``documents`` is a
        list of chunk strings and ``similarity_scores`` are floats in
        [0, 1], 1 being the closest match in this result set.
    """
    # Embed the query with the same model used for the corpus.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    distances, indices = index.search(query_embedding, n_results)
    documents = [chunks[i] for i in indices[0]]
    # L2 distance: lower is better. Normalize to [0, 1] where 1 = most similar.
    max_distance = float(np.max(distances))
    if max_distance == 0.0:
        # All hits are exact matches — avoid division by zero (original bug).
        similarity_scores = [1.0] * len(documents)
    else:
        similarity_scores = [1 - (dist / max_distance) for dist in distances[0]]
    return documents, similarity_scores
# Test the retrieval end-to-end.
query = "how has the profitability of the company been in last five years"
print('Retrieving documents...')
general_docs, general_scores = retrieve_documents(query, n_results=15)
print(f"Number of docs returned for general query: {len(general_docs)}")

# Print the results
# for i, (doc, score) in enumerate(zip(general_docs, general_scores)):
#     print(f"\nResult {i+1} (Score: {score:.4f}):")
#     print(f"{doc[:200]}...")

# HyDE-style retrieval: append a hypothetical answer to the query so the
# embedding lands closer to answer-shaped chunks.
new_query = query + generate_hypothetical_answer(query)
# Bug fix: retrieve_documents returns (documents, scores); the original
# passed the whole tuple as context. Pass only the document list.
# NOTE(review): assumes query_llm_with_context expects a list of chunk
# strings — confirm against helper.py.
combined_context, _combined_scores = retrieve_documents(new_query, n_results=15)
answer = query_llm_with_context(query, combined_context, top_n=3)
# Bug fix: original string lacked the f-prefix, printing the literal
# '{answer}' instead of the LLM response.
print(f'final_response:{answer}')