# Gaia - BioMundo / main.py
# Uploaded by pedrogrisi via huggingface_hub (commit 6cc929d, verified)
import os
import glob
import logging
from dotenv import load_dotenv
import gradio as gr
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (reads a local .env file if one exists).
load_dotenv()

# Access API key
# NOTE(review): the env var name "OPEN_AI_APIKEY" is non-standard (usually
# OPENAI_API_KEY) — confirm it matches the deployment configuration.
api_key = os.getenv("OPEN_AI_APIKEY")

# Chat model used for answering, and the Chroma persistence directory name.
MODEL = "gpt-4o-mini"
db_name = "vector_db"
# System prompt defining Gaia's personality and role. Kept in Portuguese on
# purpose: it is sent verbatim to the model (interpolated into the QA prompt
# later in this file) and the assistant serves Portuguese-speaking staff.
system_prompt = """
Você é Gaia, assistente virtual da Bio Mundo, com 26 anos. Sua personalidade é amigável, carismática e traz um toque de humor. Você ajuda funcionários e parceiros a:
- Gerir pessoas;
- Estimular um clima organizacional positivo;
- Aplicar valores éticos da empresa.
Quando o tema envolva ética, analise situações ou guie comportamentos de acordo com os seguintes princípios:
- Ética;
- Integridade;
- Conformidade com as leis e padrões de Bio Mundo;
- Foco na satisfação e proteção da marca.
Se surgir dúvida sobre condutas ou regras, explique os princípios de forma clara e educativa.
"""
# Knowledge-base discovery: category sub-folders, plus loose .txt files that
# sit directly at the top level of knowledge-base/.
folders = [entry for entry in glob.glob("knowledge-base/*") if os.path.isdir(entry)]
files = [entry for entry in glob.glob("knowledge-base/*.txt") if os.path.isfile(entry)]
def add_metadata(doc, doc_type):
    """Tag *doc* with its source category under the "doc_type" metadata key.

    Returns the same document object to allow use inside comprehensions.
    """
    metadata = doc.metadata
    metadata["doc_type"] = doc_type
    return doc
# All .txt files are decoded as UTF-8 regardless of the platform default.
text_loader_kwargs = {"encoding": "utf-8"}

documents = []

# Files sitting directly under knowledge-base/ are tagged with type "root".
for path in files:
    for doc in TextLoader(path, **text_loader_kwargs).load():
        documents.append(add_metadata(doc, "root"))

# Files inside sub-folders (if any) inherit the folder name as their doc_type.
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    documents.extend(add_metadata(doc, doc_type) for doc in loader.load())

# Split documents into small overlapping chunks for retrieval.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

print(f"Total chunks: {len(chunks)}")
print(f"Documents types found: {set(doc.metadata['doc_type'] for doc in chunks)}")

# Fail fast when the knowledge base is empty — the chain is useless without it.
if not chunks:
    raise ValueError("Nenhum documento encontrado em 'knowledge-base'. Adicione arquivos .txt para continuar.")

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)

# Pre-creation diagnostics for the vector store build below.
logger.info("Starting vectorstore creation...")
logger.info(f"Number of documents: {len(chunks)}")
logger.info(f"Working directory: {os.getcwd()}")
def create_vectorstore(chunks, embeddings, db_name):
    """Build a Chroma vector store from *chunks*, persisted under *db_name*.

    Any pre-existing collection in the persistence directory is deleted first
    so stale vectors never leak into the new store. If persistent creation
    fails for any reason, an in-memory store is returned as a best-effort
    fallback instead of crashing the app.
    """
    try:
        # Wipe the previous collection before rebuilding.
        if os.path.exists(db_name):
            logger.info(f"Deleting existing collection in {db_name}")
            stale = Chroma(persist_directory=db_name, embedding_function=embeddings)
            stale.delete_collection()

        logger.info("Creating new vectorstore...")
        store = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_name,
        )
        logger.info(f"Created vectorstore with {len(chunks)} documents")
        return store
    except Exception as e:
        logger.error(f"Error creating vectorstore: {str(e)}")
        # Fallback to in-memory vectorstore if persistence fails
        logger.info("Falling back to in-memory vectorstore")
        return Chroma.from_documents(documents=chunks, embedding=embeddings)
# Build (or rebuild) the vector store from the prepared chunks.
vectorstore = create_vectorstore(chunks, embeddings, db_name)

# Peek at one stored vector to report collection size and embedding width.
# NOTE(review): `_collection` is a private Chroma attribute and may break
# across chromadb versions — confirm against the pinned release.
collection = vectorstore._collection
count = collection.count()
sample = collection.get(limit=1, include=["embeddings"])["embeddings"]
dimensions = len(sample[0])
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, api_key=api_key)

# Conversation memory: accumulates the dialogue under the "chat_history" key.
# NOTE(review): ConversationBufferMemory is deprecated in recent LangChain
# releases in favor of LCEL history wrappers — confirm the pinned version.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer'  # Specify which output to store
)

# the retriever is an abstraction over the VectorStore that will be used during RAG;
# k=3 means the top three chunks are fetched per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# Create the prompt templates.
# Rewrites a follow-up question into a standalone question using the chat
# history, so vector retrieval works without conversational context.
condense_question_prompt = PromptTemplate.from_template("""
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:""")

# QA prompt: Gaia's persona is baked in via the f-string, while {{context}}
# and {{question}} are escaped so they survive as template variables.
qa_prompt = PromptTemplate.from_template(f"""
{system_prompt}
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{{context}}
Question: {{question}}
Helpful Answer:""")
# Putting it together: the conversational RAG chain — condense follow-ups,
# retrieve top-k chunks, and answer with the chat model configured in MODEL
# (gpt-4o-mini; a previous comment here incorrectly said GPT 3.5), while
# persisting the dialogue in `memory`.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_prompt=condense_question_prompt,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
    verbose=True
)
def chat(question, history):
    """Gradio chat callback.

    On the very first turn (empty history) the incoming question is not sent
    to the chain; a fixed welcome message is returned instead. All later
    turns are answered through the conversational RAG chain.
    """
    if history:
        result = conversation_chain.invoke({"question": question})
        return result["answer"]

    # First interaction: greet the user instead of querying the chain.
    return """Olá! Meu nome é Gaia, sou sua parceira na Bio Mundo. Estou aqui para ajudar com o que você precisar! Meus principais tópicos são:
- Gestão de pessoas;
- Clima organizacional positivo;
- Valores éticos da cultura Bio Mundo.
Posso te ajudar com algo específico hoje? 😊"""
# Set up the Gradio interface with fullscreen layout and default bot colors.
with gr.Blocks(css="""
/* Make the interface full screen */
.gradio-container {
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    width: 100%;
    height: 100%;
    max-width: 100%;
    max-height: 100%;
    margin: 0;
    padding: 0;
    overflow: hidden;
}
/* Remove custom styling for bot messages to use default colors */
.message.bot {
    /* Reset any custom styling */
}
/* Keep the user message styling */
.message.user {
    background-color: #4CAF50 !important;
    color: white !important;
}
/* Make the chatbot container taller */
#chatbot {
    height: calc(100vh - 200px) !important;
}
/* Center the title */
.title-container {
    text-align: center;
    margin-bottom: 1rem;
}
/* Make the title text green */
.title-text {
    color: #2E7D32;
    font-size: 2.5rem;
}
""") as demo:
    # Custom header area.
    # NOTE(review): this HTML block is empty — populate it with the intended
    # title markup or drop the call.
    gr.HTML("""
    """)

    # Chat transcript widget; type="messages" delivers OpenAI-style message
    # dicts to the callback. Second avatar image is the bot's.
    chatbot = gr.Chatbot(
        placeholder="<strong>Sou Gaia, parceira da Biomundo!</strong><br>Pergunte-me qualquer coisa!",
        avatar_images=(None, "assets/Gaia.jpeg"),
        type="messages",
        elem_id="chatbot",
        height=700  # Increased height
    )

    # Wire the chat() callback and canned example questions into the UI.
    chat_interface = gr.ChatInterface(
        fn=chat,
        chatbot=chatbot,
        examples=[
            "Como posso melhorar o clima organizacional na minha unidade?",
            "Qual é a política de não concorrência da Bio Mundo?",
            "Como devo reconhecer o bom trabalho da minha equipe?",
            "Um funcionário está desrespeitando outro. O que devo fazer?",
            "Quais são as métricas avaliadas no Programa Comunidade Bio+?"
        ]
    )

# Launch the Gradio app (share=True creates a public tunnel link).
if __name__ == "__main__":
    demo.launch(share=True)