File size: 3,171 Bytes
4787e22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load environment variables from .env file (no-op if the file is absent).
load_dotenv()

# Define the path for the FAISS vector store.
# NOTE(review): relative path — resolved against the current working directory,
# not this file's location; confirm callers always run from the expected cwd.
DB_FAISS_PATH = 'vectorStore'

def clean_text(text):
    """Drop boilerplate lines (headers/footers) and collapse all whitespace.

    A line survives only if it contains more than five alphabetic
    characters; the survivors are then flattened into a single
    space-normalized string.
    """
    # Collapse runs of blank lines first.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines that carry real prose (more than 5 letters).
    kept = [ln for ln in text.split('\n') if sum(ch.isalpha() for ch in ln) > 5]
    # Flatten remaining lines and normalize every whitespace run to one space.
    return re.sub(r'\s+', ' ', '\n'.join(kept)).strip()

def load_documents(data_dir='../data'):
    """Recursively load PDF and UTF-8 text documents from *data_dir*.

    Fix/generalization: the directory was hard-coded to '../data' even though
    the docstring and status messages referred to 'data/'; it is now a
    parameter with the original value as default, so existing callers are
    unaffected.

    Args:
        data_dir: Root directory to walk for '.pdf' and '.txt' files
            (case-insensitive extensions). Defaults to '../data'.

    Returns:
        list: Document objects — one per PDF page (via PyPDFLoader) and one
        per text file. Text files that are not valid UTF-8 are skipped with
        a warning rather than aborting the whole run.
    """
    documents = []

    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            lower = file.lower()
            if lower.endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())
            elif lower.endswith('.txt'):
                print(f"Loading TXT {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
                except UnicodeDecodeError as e:
                    # Best-effort: report and continue with the remaining files.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
            # Any other extension is silently ignored.
    return documents

def create_vector_db():
    """Run the full ingestion pipeline: load, clean, chunk, embed, persist.

    Loads documents via load_documents(), normalizes their text with
    clean_text(), splits them into overlapping chunks, embeds the chunks
    with a HuggingFace sentence-transformer, and saves the resulting FAISS
    index to DB_FAISS_PATH. Exits early when no documents are found.
    """
    print("Step 1: Loading documents from the 'data/' directory...")
    docs = load_documents()

    # Guard clause: nothing to index means nothing else to do.
    if not docs:
        print("No documents found in the 'data' directory. Exiting.")
        return

    print(f"Loaded {len(docs)} document(s).")

    print("\nStep 2: Cleaning the text content...")
    for document in docs:
        document.page_content = clean_text(document.page_content)
    print("Text cleaning complete.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings with HuggingFace...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("Step 5: Building FAISS index...")
    index = FAISS.from_documents(chunks, embedder)
    index.save_local(DB_FAISS_PATH)

    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")

# Script entry point: build the vector store only when executed directly
# (importing this module triggers no ingestion).
if __name__ == "__main__":
    create_vector_db()