# My_Self_Bot / src/ingest.py
# BOB0920's picture
# Update src/ingest.py
# 8c79180 verified
# Raw
# History Blame Contribute Delete
# 1.92 kB
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
# Load environment variables (via python-dotenv's load_dotenv) before anything else runs.
load_dotenv()
# Folder scanned for input documents. This is a relative path, so it is
# resolved against the current working directory, not this script's location.
DATA_PATH = "data"
# Absolute directory containing this script; anchors the output location below.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Destination of the saved FAISS index: <script dir>/vectorstore/db_faiss.
DB_FAISS_PATH = os.path.join(SCRIPT_DIR, "vectorstore", "db_faiss")
def _loader_for(file_path, filename):
    """Return a document loader for *filename*, or None if its type is unsupported."""
    # Compare extensions case-insensitively so "REPORT.PDF" is not silently skipped.
    lowered = filename.lower()
    if lowered.endswith(".pdf"):
        return PyPDFLoader(file_path)
    if lowered.endswith(".txt"):
        return TextLoader(file_path, encoding='utf-8')
    return None


def create_vector_db():
    """Build a FAISS vector store from the PDF and text files in DATA_PATH.

    Every ``.pdf`` / ``.txt`` file directly inside DATA_PATH is loaded, split
    into 500-character chunks with a 50-character overlap, embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on the CPU, and the
    resulting index is saved to DB_FAISS_PATH.

    Returns:
        None. Progress is reported with ``print``. The function returns early
        (without writing anything) when DATA_PATH is not a directory or when
        no supported documents are found.
    """
    # isdir instead of exists: a regular file at DATA_PATH would pass an
    # existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(DATA_PATH):
        print(f"Directory {DATA_PATH} not found.")
        return

    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore the saved index) reproducible between runs.
    for filename in sorted(os.listdir(DATA_PATH)):
        file_path = os.path.join(DATA_PATH, filename)
        loader = _loader_for(file_path, filename)
        if loader is None:
            continue
        documents.extend(loader.load())
        print(f"Loaded {filename}")

    if not documents:
        print("No documents found to ingest.")
        return

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create embeddings (using HuggingFace - FREE!)
    print("Generating embeddings locally with sentence-transformers...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )

    # Create vector store
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print(f"Vector store saved to {DB_FAISS_PATH}")


# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    create_vector_db()