dauid64 committed on
Commit
7dc5f52
·
0 Parent(s):

first commit

Browse files
Files changed (5) hide show
  1. .gitignore +3 -0
  2. .vscode/launch.json +18 -0
  3. app/main.py +77 -0
  4. app/model/chat_agent.py +7 -0
  5. doc_parse/main.py +60 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /data/articles/
2
+ .env
3
+ venv
.vscode/launch.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Start app",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "module": "streamlit",
12
+ "args": [
13
+ "run",
14
+ "app/main.py"
15
+ ]
16
+ }
17
+ ]
18
+ }
app/main.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain_openai import OpenAIEmbeddings
4
+ from langchain_core.prompts import PromptTemplate
5
+ from langchain_core.messages import SystemMessage
6
+ from langchain_qdrant import QdrantVectorStore
7
+ from langchain_openai import ChatOpenAI
8
+ import streamlit as st
9
+
10
+ load_dotenv(dotenv_path=".env", override=True)
11
+
12
+ base_prompt = PromptTemplate.from_template(
13
+ """
14
+ Responda a pergunta abaixo de acordo com base no contexto passado para você. O contexto será passado em formato JSON
15
+ onde teremos 3 referências para sua resposta, as chaves "source" indica o caminho do arquivo juntamente com
16
+ o nome do arquivo PDF, "page" que é a página localizada no arquivo PDF e "page_label" que é a página que está
17
+ sendo indicada no próprio texto, "page_content" é o conteúdo do arquivo,
18
+ caso tenha alguma outra chave pode somente ignorar. Lembre-se de sempre que usar
19
+ uma referência do contexto que está utilizando e responder em formato markdown.
20
+
21
+ Contexto: {context}
22
+
23
+ Pergunta: {question}
24
+ """
25
+ )
26
+
27
+ if "messages" not in st.session_state:
28
+ st.session_state.messages = [
29
+ {
30
+ "role": "system",
31
+ "content": "Você é um assistente de pesquisa que ajuda a encontrar informações sobre proteomica"
32
+ }
33
+ ]
34
+
35
+ for message in st.session_state.messages:
36
+ if message["role"] != "system":
37
+ with st.chat_message(message["role"]):
38
+ st.markdown(message["content"])
39
+
40
+ model = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv('OPENAI_KEY'))
41
+
42
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.getenv('OPENAI_KEY'))
43
+
44
+ vector_store = QdrantVectorStore.from_existing_collection(
45
+ url=os.getenv('QDRANT_URL'),
46
+ api_key=os.getenv('QDRANT_KEY'),
47
+ embedding=embeddings,
48
+ collection_name='proteomica',
49
+ )
50
+
51
+ input = st.chat_input("Digite sua pergunta:")
52
+
53
+ if input:
54
+ st.chat_message("user").markdown(input)
55
+
56
+ documentos = vector_store.similarity_search(input, k=3)
57
+
58
+ documentos_json = []
59
+ for documento in documentos:
60
+ documento_json = {
61
+ "page_content": documento.page_content,
62
+ "source": documento.metadata['source'],
63
+ "page_label": documento.metadata["page_label"],
64
+ "page": documento.metadata["page"],
65
+ }
66
+ documentos_json.append(documento_json)
67
+
68
+ prompt = base_prompt.format(context=documentos_json, question=input)
69
+
70
+ resposta = model.invoke(st.session_state.messages)
71
+ st.session_state.messages.append({"role": "assistant", "content": resposta.content})
72
+
73
+ st.chat_message("assistant").markdown(resposta.content)
74
+
75
+
76
+
77
+
app/model/chat_agent.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
from qdrant_client import QdrantClient

class ChatAgent:
    """Chat agent holding a Qdrant client connection.

    Skeleton class: currently only stores the connection for later use.
    """

    def __init__(self, conn: QdrantClient) -> None:
        # Qdrant client this agent will query; ownership stays with the caller.
        self.conn = conn
doc_parse/main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Ingestion script: rebuilds the 'proteomica' Qdrant collection from the
PDF articles in ./data/articles (load -> chunk -> embed -> upsert)."""

import os
from uuid import uuid4

from dotenv import load_dotenv
# BUG FIX: PyPDFLoader was imported twice; the duplicate line is removed.
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Load OPENAI_KEY / QDRANT_URL / QDRANT_KEY from the local .env file.
load_dotenv(dotenv_path=".env", override=True)

# text-embedding-3-large emits 3072-dim vectors, matching VectorParams below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.getenv('OPENAI_KEY'))

qdrant_client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_KEY'),
)

# Rebuild the collection from scratch: drop it if present, then create it
# fresh so re-running the script never duplicates documents.
if qdrant_client.collection_exists("proteomica"):
    qdrant_client.delete_collection("proteomica")
qdrant_client.create_collection(
    collection_name="proteomica",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=qdrant_client,
    embedding=embeddings,
    collection_name="proteomica",
)

caminho_artigos = "./data/articles"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

for artigo in os.listdir(caminho_artigos):
    # Guard clause instead of nesting the whole body under the extension check.
    if not artigo.endswith('.pdf'):
        continue

    # Assigned before the try so the except handler can always report the path
    # (previously it referenced a name bound inside the try block).
    caminho_completo = os.path.join(caminho_artigos, artigo)

    try:
        loader = PyPDFLoader(caminho_completo)
        docs = loader.load()
        splits = text_splitter.split_documents(docs)

        # One fresh UUID per chunk as the Qdrant point id.
        uuids = [str(uuid4()) for _ in range(len(splits))]
        vector_store.add_documents(splits, ids=uuids)

        print(f"✅ Processado: {caminho_completo}")
    except Exception as e:
        # Best-effort ingestion: log the failing file and move on to the next.
        print(f"❌ Erro ao processar: {caminho_completo}: {e}")