# NOTE: the original paste carried Hugging Face Spaces page artifacts
# ("Spaces: / Sleeping / Sleeping") here; they are not part of the program.
"""Gradio document-QA app: load documents, embed them with Google
Generative AI embeddings, index them in FAISS, and answer chat queries
through a LangChain QA agent built in ``src.agent``."""

import os

import google.generativeai as genai
import gradio as gr
from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
)
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from src.agent import build_qa_chain

# Read GOOGLE_API_KEY from a local .env file and configure the Gemini SDK
# once, at import time, before any embedding/chat call is made.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
class AgentChain:
    """Mutable holder for the app-wide QA agent and FAISS vector store.

    Both attributes start as ``None`` and are assigned later: the agent
    right after module setup, the vector store once a document has been
    processed by the upload callback.
    """

    def __init__(self):
        # agent: the LangChain QA chain; db: the FAISS store (or None
        # until a document has been indexed).
        self.agent = self.db = None
# Module-level singleton shared by the Gradio callbacks below: the QA agent
# is built once at startup, while `db` is filled in after a file upload.
agent_chain = AgentChain()
agent_chain.agent = build_qa_chain()
def extract_text_from_files(docs):
    """Load every supported document found in the directory *docs*.

    Supported extensions are .pdf, .docx/.doc, and .txt (now matched
    case-insensitively); any other file is silently skipped.

    Args:
        docs: Path to a directory to scan (non-recursive).

    Returns:
        list: LangChain ``Document`` objects from all loaders, or the
        string ``"Directory is empty"`` when the directory has no entries.

    Bug fixes vs. the original: the ``docs.split("/")`` /
    ``"/".join(...)`` round-trip was a no-op and has been removed.
    """
    files = os.listdir(docs)
    # NOTE(review): returning a str here while returning a list below is
    # an inconsistent contract; preserved so existing callers keep working.
    if not files:
        return "Directory is empty"

    # Extension -> loader class dispatch replaces the repeated if/elif ladder.
    loaders = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".doc": Docx2txtLoader,
        ".txt": TextLoader,
    }
    documents = []
    for name in files:
        ext = os.path.splitext(name)[1].lower()
        loader_cls = loaders.get(ext)
        if loader_cls is not None:
            documents.extend(loader_cls(os.path.join(docs, name)).load())
    return documents
def extract_text_from_file(file):
    """Load a single uploaded file into LangChain documents.

    Args:
        file: Path to the file (Gradio passes a temp-file path; may be a
            path-like object, hence the ``str()`` normalization below).
            Supported extensions: .pdf, .docx/.doc, .txt.

    Returns:
        list: Loaded ``Document`` objects; empty for unsupported extensions.

    Bug fix vs. the original: the .doc branch tested
    ``file.endswith('.doc')`` on the raw object instead of the
    stringified ``filename``, which crashes for non-str path objects.
    """
    documents = []
    filename = str(file)
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(file)
        documents.extend(loader.load())
    elif filename.endswith((".docx", ".doc")):
        loader = Docx2txtLoader(file)
        documents.extend(loader.load())
    elif filename.endswith(".txt"):
        loader = TextLoader(file)
        documents.extend(loader.load())
    print("Text extracted")
    return documents
def get_text_chunks(text):
    """Split loaded documents into large overlapping chunks for embedding.

    Args:
        text: Iterable of LangChain ``Document`` objects.

    Returns:
        list: Chunked ``Document`` objects (10k chars, 1k overlap).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    )
    split_docs = splitter.split_documents(text)
    print("Chunks splitted")
    return split_docs
def save_in_faiss(text_chunks, save=False):
    """Embed document chunks and index them in an in-memory FAISS store.

    Args:
        text_chunks: Chunked ``Document`` objects to embed.
        save: When true, also persist the index to ./faiss_index.

    Returns:
        The populated FAISS vector store.
    """
    store = FAISS.from_documents(
        text_chunks,
        embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    )
    if save:
        store.save_local("faiss_index")
    print("Document search created")
    return store
def process_files(file):
    """Gradio upload callback: index *file* and stash the vector store.

    Loads the file, chunks it, builds a FAISS index, and publishes the
    index on the module-level ``agent_chain`` for answer_query to use.
    Returns the file unchanged so the Gradio component keeps displaying it.
    """
    chunks = get_text_chunks(extract_text_from_file(file))
    agent_chain.db = save_in_faiss(chunks)
    gr.Info("Processing completed")
    return file
def answer_query(message, history):
    """Gradio chat callback: answer *message* using retrieved context.

    Args:
        message: The user's question.
        history: Chat history supplied by Gradio (unused here; the agent
            keeps its own memory).

    Returns:
        str: The agent's answer text.

    Bug fix vs. the original: ``docs = []`` was assigned AFTER the
    similarity search, unconditionally discarding every retrieved
    document (and leaving ``docs`` undefined when no index existed).
    """
    docs = []
    if agent_chain.db is not None:
        docs = agent_chain.db.similarity_search(message)
    response = agent_chain.agent(
        {"input_documents": docs, "human_input": message},
        return_only_outputs=True,
    )
    return response['output_text']