|
|
import os
|
|
|
import re
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
from langchain_core.documents import Document
|
|
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
|
from langchain_community.vectorstores import FAISS
|
|
|
|
|
|
|
|
|
# Load environment variables (e.g. API keys) from a local .env file, if present.
load_dotenv()


# Directory where the FAISS vector store is persisted by create_vector_db().
DB_FAISS_PATH = 'vectorStore'
|
|
|
|
|
|
def clean_text(text):
    """Strip boilerplate lines (headers/footers) and collapse whitespace.

    Keeps only lines that contain more than five alphabetic characters —
    page numbers, headers and footers rarely do — then flattens the
    surviving text onto a single space-separated line.
    """
    # Squash runs of blank lines down to one blank line first.
    normalized = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines with enough letters to be real prose.
    kept = [
        ln for ln in normalized.split('\n')
        if sum(ch.isalpha() for ch in ln) > 5
    ]
    # Collapse every remaining whitespace run into a single space.
    return re.sub(r'\s+', ' ', '\n'.join(kept)).strip()
|
|
|
|
|
|
def load_documents(data_dir='../data'):
    """Recursively load PDF and plain-text documents from *data_dir*.

    PDFs are parsed with ``PyPDFLoader`` (one Document per page); ``.txt``
    files are read manually as UTF-8 so that encoding problems surface
    explicitly and the offending file is skipped rather than crashing the
    run. Files with any other extension are ignored.

    Args:
        data_dir: Root directory to walk. Defaults to ``'../data'`` to
            preserve the original behavior.

    Returns:
        list[Document]: The loaded documents, each tagged with its
        source path in ``metadata``.
    """
    documents = []

    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.lower().endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                documents.extend(PyPDFLoader(file_path).load())
            elif file.lower().endswith('.txt'):
                print(f"Loading TXT {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
                except UnicodeDecodeError as e:
                    # Best-effort ingestion: report and move on instead of aborting.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
            # Any other extension is deliberately skipped.

    return documents
|
|
|
|
|
|
def create_vector_db():
    """Run the full ingestion pipeline: load, clean, chunk, embed, index.

    Reads documents via load_documents(), normalizes their text with
    clean_text(), splits them into overlapping chunks, embeds the chunks
    with a HuggingFace sentence-transformer, and persists a FAISS index
    at DB_FAISS_PATH. Returns early if no documents are found.
    """
    print("Step 1: Loading documents from the 'data/' directory...")
    docs = load_documents()

    # Nothing to index — bail out early.
    if not docs:
        print("No documents found in the 'data' directory. Exiting.")
        return

    print(f"Loaded {len(docs)} document(s).")

    print("\nStep 2: Cleaning the text content...")
    for document in docs:
        document.page_content = clean_text(document.page_content)
    print("Text cleaning complete.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings with HuggingFace...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("Step 5: Building FAISS index...")
    index = FAISS.from_documents(chunks, embedder)
    index.save_local(DB_FAISS_PATH)

    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")
|
|
|
|
|
|
# Script entry point: build the vector store when run directly.
if __name__ == "__main__":
    create_vector_db()
|
|
|
|