# re-evaluation_model / generate_index.py
# PranavRatnalikar's picture
# removed api key
# eb93669 verified
import os
import pdfplumber
import pickle
import faiss
import numpy as np
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
# Configuration
TEMPLATE_DIR = "dataset"  # Folder containing template answer PDFs
INDEX_NAME = "index"  # Prefix for the generated FAISS index files
# Read the Google API key from the GOOGLE_API_KEY environment variable so the
# secret never has to be written into this file. Falls back to an empty
# string when the variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a single PDF file.

    Args:
        pdf_path: Path of the PDF to read; passed to ``pdfplumber.open``.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace stripped. Pages whose ``extract_text()`` result is falsy
        (``None`` or an empty string) contribute nothing.
    """
    with pdfplumber.open(pdf_path) as pdf_reader:
        # extract_text() can return None; normalise that to "".
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page; empty pages are dropped to avoid runs of
    # blank lines.
    return "\n".join(text for text in page_texts if text).strip()
def process_template_answers():
    """Collect the text of every template-answer PDF in ``TEMPLATE_DIR``.

    The question ID of a file is its name without the extension, upper-cased
    (e.g. ``1a.pdf`` -> ``1A``). Files whose extension is not ``.pdf``
    (compared case-insensitively) and PDFs that yield no text are skipped.

    Returns:
        A dict mapping question ID to extracted answer text, inserted in
        sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``TEMPLATE_DIR`` cannot
            be listed (for example, when it does not exist).
    """
    template_answers = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting dict order reproducible between runs.
    for file_name in sorted(os.listdir(TEMPLATE_DIR)):
        # splitext removes only the final extension, unlike str.replace,
        # which would also strip ".pdf" from the middle of a name.
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != ".pdf":
            continue
        question_number = stem.upper()  # Question ID, e.g. "1A"
        file_path = os.path.join(TEMPLATE_DIR, file_name)
        extracted_text = extract_text_from_pdf(file_path)
        if extracted_text:
            template_answers[question_number] = extracted_text
    return template_answers
def generate_faiss_index(api_key):
    """Build a FAISS index of template-answer embeddings and save it to disk.

    Extracts the template answers, embeds each answer text with
    ``GoogleGenerativeAIEmbeddings`` (model ``models/embedding-001``), adds
    the vectors to a flat L2 FAISS index, and writes two files named after
    ``INDEX_NAME``: ``<INDEX_NAME>.faiss`` (the index) and
    ``<INDEX_NAME>.pkl`` (the pickled list of question IDs, in the same
    order as the rows of the index).

    Args:
        api_key: Google API key passed to ``GoogleGenerativeAIEmbeddings``.

    Returns:
        None. Returns early, without writing any file, when no template
        answers are found.
    """
    print("🔄 Extracting template answers...")
    template_answers = process_template_answers()
    if not template_answers:
        print("❌ No valid template answers found.")
        return

    print("🔍 Generating embeddings...")
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=api_key
    )
    # Keys and values come from the same dict, so question_numbers[i] is the
    # ID of texts[i] and therefore of row i in the index.
    texts = list(template_answers.values())
    question_numbers = list(template_answers.keys())
    # FAISS expects float32 vectors.
    text_embeddings = np.array(
        [embeddings.embed_query(text) for text in texts]
    ).astype("float32")

    # NOTE(review): the emoji in this message was mis-encoded beyond exact
    # recovery in the source; a folder glyph is used as the replacement.
    print("📁 Creating FAISS index...")
    dimension = text_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(text_embeddings)

    print("💾 Saving FAISS index...")
    faiss.write_index(index, f"{INDEX_NAME}.faiss")
    # The index stores vectors only; the question IDs are saved alongside so
    # a row number can be mapped back to its question.
    with open(f"{INDEX_NAME}.pkl", "wb") as f:
        pickle.dump(question_numbers, f)
    print("✅ Indexing complete!")
if __name__ == "__main__":
    # Script entry point: build and save the index using the module-level key.
    generate_faiss_index(api_key=API_KEY)