akshansh36 commited on
Commit
cd09618
·
verified ·
1 Parent(s): 1b32752

Create utils/tools.py

Browse files
Files changed (1) hide show
  1. src/utils/tools.py +72 -0
src/utils/tools.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_core.tools import tool
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from config import PINECONE_INDEX
from dotenv import load_dotenv

# Pull API keys from a local .env file before reading them below.
load_dotenv()
GOOGLE_API_KEY = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API_KEY")

# Embedding model used to vectorize user queries for semantic search.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

# Pinecone client and the index holding the document-chunk vectors.
pc = pinecone.Pinecone(api_key=PINECONE_API)
index = pc.Index(PINECONE_INDEX)
23
+
24
@tool
def get_context(query: str) -> str:
    """
    Retrieve context for a user query via semantic search over indexed chunks.

    The query is embedded with the module-level Google Generative AI
    embeddings model, then the Pinecone index is queried for the top 15
    matching document chunks. Each match's metadata is read for the text
    chunk and its source webpage URL, and all matches are concatenated
    into a single formatted string.

    Args:
        query (str): A user query string used for semantic matching
            against the document index.

    Returns:
        str: A formatted string containing each matched chunk followed by
        its webpage URL and a separator line. If there are no matches,
        only the initial single-space string is returned.
    """
    embedding = google_embeddings.embed_query(query)
    search_results = index.query(
        vector=embedding,
        top_k=15,  # number of nearest chunks to retrieve
        include_metadata=True,
    )

    # Aggregate every match into one context string; enumerate gives
    # 1-based chunk numbering for the output labels.
    context = " "
    for count, match in enumerate(search_results["matches"], start=1):
        metadata = match["metadata"]
        # .get() tolerates chunks indexed without these metadata keys
        # (they render as "None" rather than raising KeyError).
        chunk = metadata.get("chunk")
        url = metadata.get("url")

        context += f"""
        Chunk {count}:
        {chunk}
        webpage_url: {url}
        #########################################
        """

    return context
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+