File size: 3,171 Bytes
4787e22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load environment variables from .env file (no-op if the file is absent).
load_dotenv()

# Define the path for the FAISS vector store.
# NOTE(review): relative path — resolved against the current working directory,
# not this file's location; confirm callers always run from the expected cwd.
DB_FAISS_PATH = 'vectorStore'

def clean_text(text):
    """Drop boilerplate lines (headers/footers) and collapse all whitespace.

    A line survives only if it contains more than five alphabetic
    characters; the survivors are then flattened into a single
    space-normalized string.
    """
    # Collapse runs of blank lines first.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines that carry real prose (more than 5 letters).
    kept = [ln for ln in text.split('\n') if sum(ch.isalpha() for ch in ln) > 5]
    # Flatten remaining lines and normalize every whitespace run to one space.
    return re.sub(r'\s+', ' ', '\n'.join(kept)).strip()

def load_documents(data_dir='../data'):
    """Recursively load PDF and UTF-8 text documents from *data_dir*.

    Fix/generalization: the directory was hard-coded to '../data' even though
    the docstring and status messages referred to 'data/'; it is now a
    parameter with the original value as default, so existing callers are
    unaffected.

    Args:
        data_dir: Root directory to walk for '.pdf' and '.txt' files
            (case-insensitive extensions). Defaults to '../data'.

    Returns:
        list: Document objects — one per PDF page (via PyPDFLoader) and one
        per text file. Text files that are not valid UTF-8 are skipped with
        a warning rather than aborting the whole run.
    """
    documents = []

    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            lower = file.lower()
            if lower.endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())
            elif lower.endswith('.txt'):
                print(f"Loading TXT {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
                except UnicodeDecodeError as e:
                    # Best-effort: report and continue with the remaining files.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
            # Any other extension is silently ignored.
    return documents

def create_vector_db():
    """Run the full ingestion pipeline: load, clean, chunk, embed, persist.

    Loads documents via load_documents(), normalizes their text with
    clean_text(), splits them into overlapping chunks, embeds the chunks
    with a HuggingFace sentence-transformer, and saves the resulting FAISS
    index to DB_FAISS_PATH. Exits early when no documents are found.
    """
    print("Step 1: Loading documents from the 'data/' directory...")
    docs = load_documents()

    # Guard clause: nothing to index means nothing else to do.
    if not docs:
        print("No documents found in the 'data' directory. Exiting.")
        return

    print(f"Loaded {len(docs)} document(s).")

    print("\nStep 2: Cleaning the text content...")
    for document in docs:
        document.page_content = clean_text(document.page_content)
    print("Text cleaning complete.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings with HuggingFace...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("Step 5: Building FAISS index...")
    index = FAISS.from_documents(chunks, embedder)
    index.save_local(DB_FAISS_PATH)

    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")

# Script entry point: build the vector store only when executed directly
# (importing this module triggers no ingestion).
if __name__ == "__main__":
    create_vector_db()