Spaces:
Sleeping
Sleeping
Commit ·
7dc5f52
0
Parent(s):
first commit
Browse files- .gitignore +3 -0
- .vscode/launch.json +18 -0
- app/main.py +77 -0
- app/model/chat_agent.py +7 -0
- doc_parse/main.py +60 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/data/articles/
|
| 2 |
+
.env
|
| 3 |
+
venv
|
.vscode/launch.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
// Use IntelliSense to learn about possible attributes.
|
| 3 |
+
// Hover to view descriptions of existing attributes.
|
| 4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
| 5 |
+
"version": "0.2.0",
|
| 6 |
+
"configurations": [
|
| 7 |
+
{
|
| 8 |
+
"name": "Start app",
|
| 9 |
+
"type": "debugpy",
|
| 10 |
+
"request": "launch",
|
| 11 |
+
"module": "streamlit",
|
| 12 |
+
"args": [
|
| 13 |
+
"run",
|
| 14 |
+
"app/main.py"
|
| 15 |
+
]
|
| 16 |
+
}
|
| 17 |
+
]
|
| 18 |
+
}
|
app/main.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage
from langchain_qdrant import QdrantVectorStore
from langchain_openai import ChatOpenAI
import streamlit as st

# Streamlit RAG chat app: retrieves proteomics PDF chunks from Qdrant and
# answers questions with GPT-4o-mini, grounding the answer in the retrieved context.

load_dotenv(dotenv_path=".env", override=True)

# Prompt that wraps the retrieved documents (as JSON) plus the user question.
# The template text is user-facing model input — kept verbatim (Portuguese).
base_prompt = PromptTemplate.from_template(
    """
    Responda a pergunta abaixo de acordo com base no contexto passado para você. O contexto será passado em formato JSON
    onde teremos 3 referências para sua resposta, as chaves "source" indica o caminho do arquivo juntamente com
    o nome do arquivo PDF, "page" que é a página localizada no arquivo PDF e "page_label" que é a página que está
    sendo indicada no próprio texto, "page_content" é o conteúdo do arquivo,
    caso tenha alguma outra chave pode somente ignorar. Lembre-se de sempre que usar
    uma referência do contexto que está utilizando e responder em formato markdown.

    Contexto: {context}

    Pergunta: {question}
    """
)

# Initialize chat history once per session with the system instruction.
if "messages" not in st.session_state:
    st.session_state.messages = [
        {
            "role": "system",
            "content": "Você é um assistente de pesquisa que ajuda a encontrar informações sobre proteomica"
        }
    ]

# Replay prior turns on every rerun (Streamlit re-executes the whole script).
for message in st.session_state.messages:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

model = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv('OPENAI_KEY'))

embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.getenv('OPENAI_KEY'))

# Connects to an existing Qdrant collection; doc_parse/main.py populates it.
vector_store = QdrantVectorStore.from_existing_collection(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_KEY'),
    embedding=embeddings,
    collection_name='proteomica',
)

# Renamed from `input` to avoid shadowing the builtin.
pergunta = st.chat_input("Digite sua pergunta:")

if pergunta:
    st.chat_message("user").markdown(pergunta)

    documentos = vector_store.similarity_search(pergunta, k=3)

    # Serialize the retrieved chunks into the JSON shape the prompt describes.
    documentos_json = []
    for documento in documentos:
        documento_json = {
            "page_content": documento.page_content,
            "source": documento.metadata['source'],
            "page_label": documento.metadata["page_label"],
            "page": documento.metadata["page"],
        }
        documentos_json.append(documento_json)

    prompt = base_prompt.format(context=documentos_json, question=pergunta)

    # BUG FIX: previously `prompt` was built but never used and the question was
    # never appended, so the model was invoked with only the system message.
    # Store the raw question in the visible history, but send the model the
    # contextualized prompt in place of that last user turn.
    st.session_state.messages.append({"role": "user", "content": pergunta})
    mensagens_para_modelo = st.session_state.messages[:-1] + [
        {"role": "user", "content": prompt}
    ]

    resposta = model.invoke(mensagens_para_modelo)
    st.session_state.messages.append({"role": "assistant", "content": resposta.content})

    st.chat_message("assistant").markdown(resposta.content)
app/model/chat_agent.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from qdrant_client import QdrantClient


class ChatAgent:
    """Minimal chat agent that holds a Qdrant connection for later use.

    Currently a plain holder: it only stores the client; no query methods
    are defined yet.
    """

    def __init__(self, conn: QdrantClient) -> None:
        """Keep a reference to the Qdrant client used for vector lookups."""
        self.conn = conn
doc_parse/main.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain_community.document_loaders import PyPDFLoader
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from uuid import uuid4
from dotenv import load_dotenv
import os

# Ingestion script: loads every PDF in ./data/articles, splits it into
# overlapping chunks, embeds them, and (re)builds the "proteomica" Qdrant
# collection consumed by app/main.py.
# FIX: removed the duplicate `PyPDFLoader` import that appeared twice.

load_dotenv(dotenv_path=".env", override=True)

# Embedding model must match the collection vector size below (3072 dims
# for text-embedding-3-large).
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.getenv('OPENAI_KEY'))

qdrant_client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_KEY'),
)

# Rebuild the collection from scratch: drop it if it already exists, then
# create it fresh so re-runs never duplicate documents.
if qdrant_client.collection_exists("proteomica"):
    qdrant_client.delete_collection("proteomica")
qdrant_client.create_collection(
    collection_name="proteomica",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=qdrant_client,
    embedding=embeddings,
    collection_name="proteomica",
)

caminho_artigos = "./data/articles"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for artigo in os.listdir(caminho_artigos):
    if artigo.endswith('.pdf'):
        # FIX: build the path before the try-block so the except handler's
        # message can never hit a NameError on `caminho_completo`.
        caminho_completo = os.path.join(caminho_artigos, artigo)
        try:
            loader = PyPDFLoader(caminho_completo)

            docs = loader.load()

            splits = text_splitter.split_documents(docs)

            # One stable random id per chunk so Qdrant treats them as distinct points.
            uuids = [str(uuid4()) for _ in range(len(splits))]

            vector_store.add_documents(splits, ids=uuids)

            print(f"✅ Processado: {caminho_completo}")

        except Exception as e:
            # Best-effort ingestion: log the failing file and keep going.
            print(f"❌ Erro ao processar: {caminho_completo}: {e}")
            continue