# Author: Subhakanta — initial commit without data folder (4787e22)
import os
import re
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Load environment variables from .env file
# (e.g. credentials/API keys read by the langchain integrations imported above).
load_dotenv()
# Define the path for the FAISS vector store
# (directory written by db.save_local() at the end of create_vector_db).
DB_FAISS_PATH = 'vectorStore'
def clean_text(text):
    """Strip low-content boilerplate lines (headers/footers) and normalize spacing.

    Args:
        text: Raw page text, possibly containing repeated headers, footers,
            page numbers, and excess blank lines.

    Returns:
        str: Single-line text with boilerplate lines removed and all runs of
        whitespace collapsed to single spaces.
    """
    # Collapse runs of blank lines into a single blank line first.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines carrying real content: more than 5 alphabetic characters
    # (drops page numbers, separators, and other layout noise).
    kept = [ln for ln in text.split('\n') if sum(ch.isalpha() for ch in ln) > 5]
    # Flatten what remains onto one line with single spaces.
    return re.sub(r'\s+', ' ', '\n'.join(kept)).strip()
def load_documents(data_dir='../data'):
    """Recursively load PDF and UTF-8 text documents from *data_dir*.

    Args:
        data_dir: Root folder to scan. Defaults to '../data' (the script is
            expected to run from a subdirectory next to the data folder).

    Returns:
        list: LangChain ``Document`` objects — PDFs via ``PyPDFLoader``,
        ``.txt`` files read as UTF-8. Files with any other extension are
        ignored; undecodable text files are skipped with a warning rather
        than aborting the whole ingest.
    """
    documents = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.lower().endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                documents.extend(PyPDFLoader(file_path).load())
            elif file.lower().endswith('.txt'):
                print(f"Loading TXT {file_path}")
                # Keep the try body minimal: only open/read can raise here.
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                except UnicodeDecodeError as e:
                    # Best-effort ingestion: skip bad files, keep going.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
                else:
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
    return documents
def create_vector_db():
    """Run the full ingestion pipeline: load, clean, chunk, embed, persist.

    Exits early (after a message) when no documents are found; otherwise
    writes a FAISS index to ``DB_FAISS_PATH``.
    """
    # Step 1: pull raw documents off disk.
    print("Step 1: Loading documents from the 'data/' directory...")
    docs = load_documents()
    if not docs:
        print("No documents found in the 'data' directory. Exiting.")
        return
    print(f"Loaded {len(docs)} document(s).")

    # Step 2: scrub boilerplate and normalize whitespace in place.
    print("\nStep 2: Cleaning the text content...")
    for doc in docs:
        doc.page_content = clean_text(doc.page_content)
    print("Text cleaning complete.")

    # Step 3: split into overlapping chunks sized for embedding.
    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks.")

    # Step 4: local sentence-transformer embeddings (no API key needed).
    print("\nStep 4: Creating embeddings with HuggingFace...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Step 5: embed all chunks into a FAISS index and persist it.
    print("Step 5: Building FAISS index...")
    index = FAISS.from_documents(chunks, embedder)
    index.save_local(DB_FAISS_PATH)
    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")
# Run the ingestion pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    create_vector_db()