|
|
import os |
|
|
import warnings |
|
|
import pickle |
|
|
import faiss |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from PyPDF2 import PdfReader |
|
|
import glob |
|
|
from together import Together |
|
|
|
|
|
# Silence noisy library deprecation/user warnings for cleaner CLI output.
warnings.filterwarnings("ignore")


# SECURITY NOTE(review): a hardcoded API key was previously committed here.
# It is retained only as a backward-compatible fallback — rotate this key and
# set TOGETHER_API_KEY in the environment instead of relying on the literal.
TOGETHER_API_KEY = os.environ.get(
    "TOGETHER_API_KEY",
    "81da53aa3044c7ebead342fb048f016a4e593a86928a783a6fdcc1e3883054e4",
)

# Client for the Together inference API (used elsewhere for LLM calls).
client = Together(api_key=TOGETHER_API_KEY)

# Sentence-embedding model used to vectorize document chunks for the index.
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    # NOTE(review): `use_auth_token` is deprecated in newer huggingface_hub
    # releases in favor of `token`; kept as-is for compatibility with the
    # currently pinned library version — confirm before migrating.
    use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
)
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF to read.

    Returns:
        The page texts joined by newlines and stripped of surrounding
        whitespace, or "" if the file cannot be processed at all.
    """
    try:
        reader = PdfReader(pdf_path)
        # Bug fix: extract_text() returns None for pages with no text layer
        # (e.g. scanned images). The old `text += page.extract_text() + "\n"`
        # raised TypeError on such a page, and the broad except below then
        # discarded the WHOLE document. Coerce None to "" per page instead.
        # Also use join() rather than quadratic += concatenation.
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages).strip()
    except Exception as e:
        # Best-effort policy: an unreadable/corrupt PDF is reported and
        # skipped rather than aborting the whole indexing run.
        print(f"Error processing {pdf_path}: {str(e)}")
        return ""
|
|
|
|
|
def create_index(chunk_size=1000):
    """Build and persist a FAISS similarity index over the PDF knowledge base.

    Reads every PDF in ``Knowledge_base/``, splits the extracted text into
    fixed-size character chunks, embeds each chunk with the module-level
    SentenceTransformer, and writes the index plus chunk metadata under
    ``knowledge_base/``.

    Args:
        chunk_size: Characters per document chunk (default 1000, matching
            the previous hard-coded value).

    Raises:
        ValueError: If no PDFs are found, or no text could be extracted.
    """
    # Output directory for index + metadata.
    # NOTE(review): the input directory below is capitalized
    # ("Knowledge_base") while this output one is not — on a case-sensitive
    # filesystem these are two different directories; confirm intentional.
    os.makedirs("knowledge_base", exist_ok=True)

    pdf_files = glob.glob("Knowledge_base/*.pdf")
    if not pdf_files:
        raise ValueError("No PDF files found in Knowledge_base directory!")

    print(f"Found {len(pdf_files)} PDF files. Processing...")

    documents = []
    filenames = []

    for pdf_path in pdf_files:
        filename = os.path.basename(pdf_path)
        content = extract_text_from_pdf(pdf_path)
        if not content:
            # Unreadable or empty PDF — already reported by the extractor.
            continue

        # Naive fixed-width chunking; chunks may split sentences mid-way,
        # which is acceptable for similarity retrieval.
        chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
        for i, chunk in enumerate(chunks):
            if chunk.strip():
                documents.append(chunk)
                # Bug fix: previously a literal "(unknown)" placeholder was
                # stored here (and the computed `filename` went unused), so
                # search hits could never be attributed to their source PDF.
                filenames.append(f"{filename} (chunk {i + 1})")

    if not documents:
        raise ValueError("No valid content extracted from PDFs!")

    print(f"Successfully processed {len(documents)} chunks from {len(pdf_files)} PDFs")

    print("Creating embeddings...")
    embeddings = embedding_model.encode(documents)

    # Inner-product search over L2-normalized vectors is equivalent to
    # cosine similarity, hence IndexFlatIP + normalize_L2 below.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print("Saving index and metadata...")
    faiss.write_index(index, "knowledge_base/faiss_index.bin")

    # Chunk texts and their source labels, aligned by position with the
    # index's vector ids.
    metadata = {
        "documents": documents,
        "filenames": filenames,
    }
    with open("knowledge_base/metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("Index and metadata saved successfully!")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the FAISS index from the Knowledge_base PDFs
    # (does nothing on import).
    create_index()