# Gaia - BioMundo / main.py
# Uploaded by pedrogrisi via huggingface_hub (commit 6cc929d, verified)
import os
import glob
import logging
from dotenv import load_dotenv
import gradio as gr
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (reads a local .env file if one exists).
load_dotenv()

# Access API key
# NOTE(review): the env var name "OPEN_AI_APIKEY" is non-standard (usually
# OPENAI_API_KEY) — confirm it matches the deployment configuration.
api_key = os.getenv("OPEN_AI_APIKEY")

# Chat model used for answering, and the Chroma persistence directory name.
MODEL = "gpt-4o-mini"
db_name = "vector_db"
# System prompt defining Gaia's personality and role. Kept in Portuguese on
# purpose: it is sent verbatim to the model (interpolated into the QA prompt
# later in this file) and the assistant serves Portuguese-speaking staff.
system_prompt = """
Você é Gaia, assistente virtual da Bio Mundo, com 26 anos. Sua personalidade é amigável, carismática e traz um toque de humor. Você ajuda funcionários e parceiros a:
- Gerir pessoas;
- Estimular um clima organizacional positivo;
- Aplicar valores éticos da empresa.
Quando o tema envolva ética, analise situações ou guie comportamentos de acordo com os seguintes princípios:
- Ética;
- Integridade;
- Conformidade com as leis e padrões de Bio Mundo;
- Foco na satisfação e proteção da marca.
Se surgir dúvida sobre condutas ou regras, explique os princípios de forma clara e educativa.
"""
# Knowledge-base discovery: category sub-folders, plus loose .txt files that
# sit directly at the top level of knowledge-base/.
folders = [entry for entry in glob.glob("knowledge-base/*") if os.path.isdir(entry)]
files = [entry for entry in glob.glob("knowledge-base/*.txt") if os.path.isfile(entry)]
def add_metadata(doc, doc_type):
    """Tag *doc* with its source category under the "doc_type" metadata key.

    Returns the same document object to allow use inside comprehensions.
    """
    metadata = doc.metadata
    metadata["doc_type"] = doc_type
    return doc
# All .txt files are decoded as UTF-8 regardless of the platform default.
text_loader_kwargs = {"encoding": "utf-8"}

documents = []

# Files sitting directly under knowledge-base/ are tagged with type "root".
for path in files:
    for doc in TextLoader(path, **text_loader_kwargs).load():
        documents.append(add_metadata(doc, "root"))

# Files inside sub-folders (if any) inherit the folder name as their doc_type.
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    documents.extend(add_metadata(doc, doc_type) for doc in loader.load())

# Split documents into small overlapping chunks for retrieval.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

print(f"Total chunks: {len(chunks)}")
print(f"Documents types found: {set(doc.metadata['doc_type'] for doc in chunks)}")

# Fail fast when the knowledge base is empty — the chain is useless without it.
if not chunks:
    raise ValueError("Nenhum documento encontrado em 'knowledge-base'. Adicione arquivos .txt para continuar.")

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)

# Pre-creation diagnostics for the vector store build below.
logger.info("Starting vectorstore creation...")
logger.info(f"Number of documents: {len(chunks)}")
logger.info(f"Working directory: {os.getcwd()}")
def create_vectorstore(chunks, embeddings, db_name):
    """Build a Chroma vector store from *chunks*, persisted under *db_name*.

    Any pre-existing collection in the persistence directory is deleted first
    so stale vectors never leak into the new store. If persistent creation
    fails for any reason, an in-memory store is returned as a best-effort
    fallback instead of crashing the app.
    """
    try:
        # Wipe the previous collection before rebuilding.
        if os.path.exists(db_name):
            logger.info(f"Deleting existing collection in {db_name}")
            stale = Chroma(persist_directory=db_name, embedding_function=embeddings)
            stale.delete_collection()

        logger.info("Creating new vectorstore...")
        store = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_name,
        )
        logger.info(f"Created vectorstore with {len(chunks)} documents")
        return store
    except Exception as e:
        logger.error(f"Error creating vectorstore: {str(e)}")
        # Fallback to in-memory vectorstore if persistence fails
        logger.info("Falling back to in-memory vectorstore")
        return Chroma.from_documents(documents=chunks, embedding=embeddings)
# Build (or rebuild) the vector store from the prepared chunks.
vectorstore = create_vectorstore(chunks, embeddings, db_name)

# Peek at one stored vector to report collection size and embedding width.
# NOTE(review): `_collection` is a private Chroma attribute and may break
# across chromadb versions — confirm against the pinned release.
collection = vectorstore._collection
count = collection.count()
sample = collection.get(limit=1, include=["embeddings"])["embeddings"]
dimensions = len(sample[0])
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, api_key=api_key)

# Conversation memory: accumulates the dialogue under the "chat_history" key.
# NOTE(review): ConversationBufferMemory is deprecated in recent LangChain
# releases in favor of LCEL history wrappers — confirm the pinned version.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer'  # Specify which output to store
)

# the retriever is an abstraction over the VectorStore that will be used during RAG;
# k=3 means the top three chunks are fetched per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# Create the prompt templates.
# Rewrites a follow-up question into a standalone question using the chat
# history, so vector retrieval works without conversational context.
condense_question_prompt = PromptTemplate.from_template("""
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:""")

# QA prompt: Gaia's persona is baked in via the f-string, while {{context}}
# and {{question}} are escaped so they survive as template variables.
qa_prompt = PromptTemplate.from_template(f"""
{system_prompt}
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{{context}}
Question: {{question}}
Helpful Answer:""")
# Putting it together: the conversational RAG chain — condense follow-ups,
# retrieve top-k chunks, and answer with the chat model configured in MODEL
# (gpt-4o-mini; a previous comment here incorrectly said GPT 3.5), while
# persisting the dialogue in `memory`.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_prompt=condense_question_prompt,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
    verbose=True
)
def chat(question, history):
    """Gradio chat callback.

    On the very first turn (empty history) the incoming question is not sent
    to the chain; a fixed welcome message is returned instead. All later
    turns are answered through the conversational RAG chain.
    """
    if history:
        result = conversation_chain.invoke({"question": question})
        return result["answer"]

    # First interaction: greet the user instead of querying the chain.
    return """Olá! Meu nome é Gaia, sou sua parceira na Bio Mundo. Estou aqui para ajudar com o que você precisar! Meus principais tópicos são:
- Gestão de pessoas;
- Clima organizacional positivo;
- Valores éticos da cultura Bio Mundo.
Posso te ajudar com algo específico hoje? 😊"""
# Set up the Gradio interface with fullscreen layout and default bot colors.
with gr.Blocks(css="""
/* Make the interface full screen */
.gradio-container {
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    width: 100%;
    height: 100%;
    max-width: 100%;
    max-height: 100%;
    margin: 0;
    padding: 0;
    overflow: hidden;
}
/* Remove custom styling for bot messages to use default colors */
.message.bot {
    /* Reset any custom styling */
}
/* Keep the user message styling */
.message.user {
    background-color: #4CAF50 !important;
    color: white !important;
}
/* Make the chatbot container taller */
#chatbot {
    height: calc(100vh - 200px) !important;
}
/* Center the title */
.title-container {
    text-align: center;
    margin-bottom: 1rem;
}
/* Make the title text green */
.title-text {
    color: #2E7D32;
    font-size: 2.5rem;
}
""") as demo:
    # Custom header area.
    # NOTE(review): this HTML block is empty — populate it with the intended
    # title markup or drop the call.
    gr.HTML("""
    """)

    # Chat transcript widget; type="messages" delivers OpenAI-style message
    # dicts to the callback. Second avatar image is the bot's.
    chatbot = gr.Chatbot(
        placeholder="<strong>Sou Gaia, parceira da Biomundo!</strong><br>Pergunte-me qualquer coisa!",
        avatar_images=(None, "assets/Gaia.jpeg"),
        type="messages",
        elem_id="chatbot",
        height=700  # Increased height
    )

    # Wire the chat() callback and canned example questions into the UI.
    chat_interface = gr.ChatInterface(
        fn=chat,
        chatbot=chatbot,
        examples=[
            "Como posso melhorar o clima organizacional na minha unidade?",
            "Qual é a política de não concorrência da Bio Mundo?",
            "Como devo reconhecer o bom trabalho da minha equipe?",
            "Um funcionário está desrespeitando outro. O que devo fazer?",
            "Quais são as métricas avaliadas no Programa Comunidade Bio+?"
        ]
    )

# Launch the Gradio app (share=True creates a public tunnel link).
if __name__ == "__main__":
    demo.launch(share=True)