|
|
import os
|
|
|
import re
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
from langchain_core.documents import Document
|
|
|
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
|
from langchain_community.vectorstores import FAISS
|
|
|
|
|
|
|
|
|
# Load environment variables (e.g. API keys) from a local .env file, if present.
load_dotenv()


# Directory where the FAISS vector store is persisted by create_vector_db().
DB_FAISS_PATH = 'vectorStore'
|
|
|
|
|
|
def clean_text(text):
    """Strip boilerplate lines (headers/footers) and collapse whitespace.

    Keeps only lines that contain more than five alphabetic characters —
    page numbers, headers and footers rarely do — then flattens the
    surviving text onto a single space-separated line.
    """
    # Squash runs of blank lines down to one blank line first.
    normalized = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines with enough letters to be real prose.
    kept = [
        ln for ln in normalized.split('\n')
        if sum(ch.isalpha() for ch in ln) > 5
    ]
    # Collapse every remaining whitespace run into a single space.
    return re.sub(r'\s+', ' ', '\n'.join(kept)).strip()
|
|
|
|
|
|
def load_documents(data_dir='../data'):
    """Recursively load PDF and plain-text documents from *data_dir*.

    PDFs are parsed with ``PyPDFLoader`` (one Document per page); ``.txt``
    files are read manually as UTF-8 so that encoding problems surface
    explicitly and the offending file is skipped rather than crashing the
    run. Files with any other extension are ignored.

    Args:
        data_dir: Root directory to walk. Defaults to ``'../data'`` to
            preserve the original behavior.

    Returns:
        list[Document]: The loaded documents, each tagged with its
        source path in ``metadata``.
    """
    documents = []

    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.lower().endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                documents.extend(PyPDFLoader(file_path).load())
            elif file.lower().endswith('.txt'):
                print(f"Loading TXT {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    documents.append(Document(page_content=text, metadata={"source": file_path}))
                except UnicodeDecodeError as e:
                    # Best-effort ingestion: report and move on instead of aborting.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
            # Any other extension is deliberately skipped.

    return documents
|
|
|
|
|
|
def create_vector_db():
    """Run the full ingestion pipeline: load, clean, chunk, embed, index.

    Reads documents via load_documents(), normalizes their text with
    clean_text(), splits them into overlapping chunks, embeds the chunks
    with a HuggingFace sentence-transformer, and persists a FAISS index
    at DB_FAISS_PATH. Returns early if no documents are found.
    """
    print("Step 1: Loading documents from the 'data/' directory...")
    docs = load_documents()

    # Nothing to index — bail out early.
    if not docs:
        print("No documents found in the 'data' directory. Exiting.")
        return

    print(f"Loaded {len(docs)} document(s).")

    print("\nStep 2: Cleaning the text content...")
    for document in docs:
        document.page_content = clean_text(document.page_content)
    print("Text cleaning complete.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings with HuggingFace...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("Step 5: Building FAISS index...")
    index = FAISS.from_documents(chunks, embedder)
    index.save_local(DB_FAISS_PATH)

    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")
|
|
|
|
|
|
# Script entry point: build the vector store when run directly.
if __name__ == "__main__":
    create_vector_db()
|
|
|
|