import os
import pdfplumber
import pickle
import faiss
import numpy as np
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS


# Directory scanned for template-answer PDFs; each file name (minus its
# extension) is used as the question number.
TEMPLATE_DIR = "dataset"
# Base name of the output files: "<INDEX_NAME>.faiss" and "<INDEX_NAME>.pkl".
INDEX_NAME = "index"
# Google Generative AI key. Read from the environment so the secret never has
# to be written into the source; falls back to the original empty placeholder
# when GOOGLE_API_KEY is not set.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a single PDF file.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order (no separator is
        inserted between pages), with leading and trailing whitespace
        stripped. Pages for which ``extract_text()`` returns a falsy value
        contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf_reader:
        # extract_text() can return None/"" for a page; treat that as empty
        # so the join below never sees a non-string.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts).strip()


def process_template_answers():
    """Collect the text of every template PDF found in ``TEMPLATE_DIR``.

    Each file whose extension is ``.pdf`` (matched case-insensitively) is read
    with ``extract_text_from_pdf``. The upper-cased file name without its
    extension becomes the question number.

    Returns:
        A dict mapping question number to extracted answer text. PDFs that
        yield no text are omitted. Entries are inserted in sorted file-name
        order, so the result does not depend on directory listing order.
    """
    template_answers = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # position of each answer reproducible between runs.
    for file_name in sorted(os.listdir(TEMPLATE_DIR)):
        # splitext() removes only the trailing extension, whereas
        # str.replace(".pdf", "") would also strip ".pdf" found elsewhere
        # in the name.
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != ".pdf":
            continue

        question_number = stem.upper()
        file_path = os.path.join(TEMPLATE_DIR, file_name)
        extracted_text = extract_text_from_pdf(file_path)
        if extracted_text:
            template_answers[question_number] = extracted_text

    return template_answers


def generate_faiss_index(api_key):
    """Embed the template answers and save a FAISS index plus its key list.

    The answers come from ``process_template_answers()``. Each answer text is
    embedded with ``GoogleGenerativeAIEmbeddings`` (model
    ``models/embedding-001``) and the vectors are added to a flat L2 index.

    Two files are written to the current working directory:
      * ``<INDEX_NAME>.faiss`` - the FAISS index.
      * ``<INDEX_NAME>.pkl``   - a pickled list of question numbers, where
        position ``i`` in the list corresponds to row ``i`` of the index.

    Args:
        api_key: Google API key passed to the embeddings client as
            ``google_api_key``.

    Returns:
        None. If no template answers are found, a message is printed and
        nothing is written.
    """
    # NOTE(review): the leading glyphs in the status messages below look like
    # mis-decoded emoji. They are kept byte-for-byte because the original
    # code points cannot be confirmed from this file alone.
    print("π Extracting template answers...")
    template_answers = process_template_answers()

    if not template_answers:
        print("β No valid template answers found.")
        return

    print("π Generating embeddings...")
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=api_key
    )

    # keys() and values() of the same dict iterate in the same order, so
    # question_numbers[i] labels texts[i] and therefore index row i.
    texts = list(template_answers.values())
    question_numbers = list(template_answers.keys())

    # The vectors are converted to float32 before being added to the index.
    text_embeddings = np.array(
        [embeddings.embed_query(text) for text in texts]
    ).astype("float32")

    print("π Creating FAISS index...")
    dimension = text_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(text_embeddings)

    print("πΎ Saving FAISS index...")
    faiss.write_index(index, f"{INDEX_NAME}.faiss")

    with open(f"{INDEX_NAME}.pkl", "wb") as f:
        pickle.dump(question_numbers, f)

    # The original literal was split by a stray line break inside the string
    # (leaving it unterminated). The break is consistent with a mis-decoded
    # check-mark emoji, so the literal is rejoined with that character.
    print("✅ Indexing complete!")


if __name__ == "__main__":
    # Build the index only when executed as a script, not on import.
    generate_faiss_index(API_KEY)