PranavRatnalikar committed on
Commit
eb93669
·
verified ·
1 Parent(s): f92e87f

removed api key

Browse files
Files changed (1) hide show
  1. generate_index.py +66 -66
generate_index.py CHANGED
@@ -1,67 +1,67 @@
1
- import os
2
- import pdfplumber
3
- import pickle
4
- import faiss
5
- import numpy as np
6
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
7
- from langchain.vectorstores import FAISS
8
-
9
- # Configuration
10
- TEMPLATE_DIR = "dataset" # Folder containing template answer PDFs
11
- INDEX_NAME = "index" # Prefix for FAISS index files
12
- API_KEY = "<REDACTED>"
13
-
14
- def extract_text_from_pdf(pdf_path):
15
- """Extracts text from a single PDF file."""
16
- text = ""
17
- with pdfplumber.open(pdf_path) as pdf_reader:
18
- for page in pdf_reader.pages:
19
- text += page.extract_text() or "" # Handle NoneType
20
- return text.strip()
21
-
22
- def process_template_answers():
23
- """Extracts answers from template PDFs and stores them in FAISS."""
24
- template_answers = {}
25
-
26
- for file in os.listdir(TEMPLATE_DIR):
27
- if file.endswith(".pdf"):
28
- question_number = file.replace(".pdf", "").upper() # Extract question ID (e.g., 1A)
29
- file_path = os.path.join(TEMPLATE_DIR, file)
30
- extracted_text = extract_text_from_pdf(file_path)
31
- if extracted_text:
32
- template_answers[question_number] = extracted_text
33
-
34
- return template_answers
35
-
36
- def generate_faiss_index(api_key):
37
- """Creates FAISS index with Google AI Embeddings."""
38
- print("πŸ”„ Extracting template answers...")
39
- template_answers = process_template_answers()
40
-
41
- if not template_answers:
42
- print("❌ No valid template answers found.")
43
- return
44
-
45
- print("πŸ” Generating embeddings...")
46
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
47
-
48
- texts = list(template_answers.values())
49
- question_numbers = list(template_answers.keys())
50
-
51
- text_embeddings = np.array([embeddings.embed_query(text) for text in texts]).astype('float32')
52
-
53
- print("πŸ“ Creating FAISS index...")
54
- dimension = text_embeddings.shape[1]
55
- index = faiss.IndexFlatL2(dimension)
56
- index.add(text_embeddings)
57
-
58
- print("πŸ’Ύ Saving FAISS index...")
59
- faiss.write_index(index, f"{INDEX_NAME}.faiss")
60
-
61
- with open(f"{INDEX_NAME}.pkl", "wb") as f:
62
- pickle.dump(question_numbers, f)
63
-
64
- print("βœ… Indexing complete!")
65
-
66
- if __name__ == "__main__":
67
  generate_faiss_index(API_KEY)
 
1
+ import os
2
+ import pdfplumber
3
+ import pickle
4
+ import faiss
5
+ import numpy as np
6
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+
9
# Configuration
TEMPLATE_DIR = "dataset"  # Folder containing template answer PDFs
INDEX_NAME = "index"  # Prefix for FAISS index files
# Read the Google API key from the environment so it never has to be written
# into (and accidentally committed with) this file. Falls back to an empty
# string when the GOOGLE_API_KEY variable is not set.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
13
+
14
def extract_text_from_pdf(pdf_path):
    """Extract the text of a single PDF file.

    Args:
        pdf_path: Path of the PDF to read; passed to ``pdfplumber.open``.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. Pages that yield no text are skipped,
        so the result is an empty string when no page has extractable text.
    """
    with pdfplumber.open(pdf_path) as pdf_reader:
        # extract_text() may return None for a page; treat that as empty.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(text for text in page_texts if text).strip()
21
+
22
def process_template_answers():
    """Collect the extracted text of every template PDF in ``TEMPLATE_DIR``.

    The file name without its extension, upper-cased, is used as the
    question ID (e.g. ``1a.pdf`` -> ``1A``). PDFs that yield no text are
    skipped.

    Returns:
        A dict mapping question ID to the extracted answer text, in sorted
        file-name order.
    """
    template_answers = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting dict order reproducible between runs.
    for file_name in sorted(os.listdir(TEMPLATE_DIR)):
        # splitext strips only the trailing extension, whereas str.replace
        # would also remove ".pdf" occurring in the middle of a name.
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != ".pdf":
            continue

        file_path = os.path.join(TEMPLATE_DIR, file_name)
        extracted_text = extract_text_from_pdf(file_path)
        if extracted_text:
            template_answers[stem.upper()] = extracted_text

    return template_answers
35
+
36
def generate_faiss_index(api_key):
    """Build a FAISS index of the template answers and save it to disk.

    Writes ``{INDEX_NAME}.faiss`` (the embedding vectors) and
    ``{INDEX_NAME}.pkl`` (the pickled list of question IDs, in the same
    order as the rows added to the index). Returns early without writing
    anything when no template answers are found.

    Args:
        api_key: Google API key for the embedding model. When empty or
            ``None``, the ``GOOGLE_API_KEY`` environment variable is used.

    Raises:
        ValueError: If no API key is passed and ``GOOGLE_API_KEY`` is unset.
    """
    # Resolve the key first so a missing key fails immediately with a clear
    # message instead of deep inside the embedding client.
    api_key = api_key or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "No Google API key provided: pass api_key or set the "
            "GOOGLE_API_KEY environment variable."
        )

    print("πŸ”„ Extracting template answers...")
    template_answers = process_template_answers()

    if not template_answers:
        print("❌ No valid template answers found.")
        return

    print("πŸ” Generating embeddings...")
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=api_key
    )

    # Keys and values come from the same dict, so position i in
    # question_numbers labels row i of the index.
    question_numbers = list(template_answers)
    texts = list(template_answers.values())

    # The vectors are converted to float32 before being added to the index.
    text_embeddings = np.array(
        [embeddings.embed_query(text) for text in texts]
    ).astype("float32")

    print("πŸ“ Creating FAISS index...")
    dimension = text_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(text_embeddings)

    print("πŸ’Ύ Saving FAISS index...")
    faiss.write_index(index, f"{INDEX_NAME}.faiss")

    with open(f"{INDEX_NAME}.pkl", "wb") as f:
        pickle.dump(question_numbers, f)

    print("βœ… Indexing complete!")
65
+
66
if __name__ == "__main__":
    # Script entry point: build and save the index using the configured key.
    generate_faiss_index(api_key=API_KEY)