ayushsinghal1510 commited on
Commit
cc65c1f
·
1 Parent(s): 60904cd

Init Commit

Browse files
.env.example ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MILVUS_DB_NAME=''
2
+ MILVUS_MODEL_NAME=''
3
+ MILVUS_VECTOR_SIZE=''
4
+ MILVUS_COLLECTION_NAME=''
5
+
6
+ REDIS_HOST=''
7
+ REDIS_PORT=''
8
+
9
+ GEMINI_API_KEY=''
10
+ GEMINI_MODEL=''
11
+
12
+ GROQ_API_KEY=''
13
+ GROQ_MODEL=''
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+ .env
38
+ .pyc
39
+ __pycache__/
40
+ *.py[cod]
41
+ *$py.class
42
+ .db
43
+ .lock
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ *.env
2
+ *.rdb
3
+ *.pyc
4
+ *.pdf
5
+ *.log
6
+
7
+ temp.json
8
+
9
+ __pycache__/
README.md CHANGED
@@ -7,4 +7,4 @@ sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  pinned: false
8
  ---
9
 
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import uvicorn

from logging import Logger
from dotenv import load_dotenv

from fastapi import FastAPI , Request , HTTPException

from fastapi.middleware.cors import CORSMiddleware

from scripts.scrapper.services import get_pdf_links

# BUG FIX: load_embedding_model was listed twice in the original import —
# the duplicate entry has been removed.
from scripts.loader.loader import (
    load_embedding_model ,
    load_milvus_client ,
    load_redis_client ,
    load_gemini_client ,
    load_groq_client ,
    load_logger
)

from scripts.routers.routers import (
    scrape_page_route ,
    scrape_pdf_route ,
    ask_route ,
)

# Read .env before any loader touches os.getenv.
load_dotenv()

# Shared service clients, created once at import time.
milvus_client = load_milvus_client()
chat_redis_client = load_redis_client(0)   # redis db 0: chat history per session_id
url_redis_client = load_redis_client(2)    # redis db 2: url -> list of ingested Milvus ids

embedding_model = load_embedding_model()

gemini_client = load_gemini_client()       # '' when GEMINI_API_KEY is unset
groq_client = load_groq_client()
logger : Logger = load_logger()

app = FastAPI()
# Wide-open CORS: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware ,
    allow_origins = ['*'] ,
    allow_credentials = True ,
    allow_methods = ['*'] ,
    allow_headers = ['*'] ,
)

logger.info('Server Startup')
50
+
51
@app.get('/')
async def read_root() : return {'Hello' : 'World'}   # health-check endpoint

@app.post('/scrape-url')
async def scrape_url(request : Request) -> dict :
    """Crawl the supplied site and return discovered PDF links and page links.

    Body: {'url': str}. Raises 400 when 'url' is missing.
    """

    data : dict = await request.json()

    if 'url' not in data : raise HTTPException(
        status_code = 400 ,
        detail = 'URL was not supplied'
    )

    url : str = data['url']

    pdf_links , all_links = await get_pdf_links(url)

    return {
        'pdf_links' : pdf_links ,
        'all_links' : all_links
    }

@app.post('/scrape-page')
async def scrape_page(request : Request) -> None :
    """Ingest one HTML page into the caller's Milvus collection.

    Body: {'url': str, 'scrape-images': bool, 'api_key': str};
    raises 400 when any key is missing. The api_key doubles as the
    Milvus collection name.
    """

    data : dict = await request.json()

    if (
        'url' not in data or
        'scrape-images' not in data or
        'api_key' not in data
    ) : raise HTTPException(
        status_code = 400 ,
        detail = 'Correct Params was not supplied'
    )

    url : str = data['url']
    scrape_images : bool = data['scrape-images']
    api_key : str = data['api_key']

    # gemini_client is passed as the image-captioning model.
    await scrape_page_route(
        url ,
        api_key ,
        logger ,
        embedding_model ,
        milvus_client ,
        gemini_client ,
        url_redis_client ,
        scrape_images
    )

@app.post('/scrape-pdf')
async def scrape_pdf(request : Request) -> None :
    """Download and ingest one PDF; same body contract as /scrape-page."""

    data : dict = await request.json()

    if (
        'url' not in data or
        'scrape-images' not in data or
        'api_key' not in data
    ) : raise HTTPException(
        status_code = 400 ,
        detail = 'Correct Params was not supplied'
    )

    url : str = data['url']
    scrape_images : bool = data['scrape-images']
    api_key : str = data['api_key']

    await scrape_pdf_route(
        url ,
        api_key ,
        logger ,
        embedding_model ,
        milvus_client ,
        gemini_client ,
        url_redis_client ,
        scrape_images
    )

@app.post('/ask')
async def ask(request : Request) -> dict :
    """Answer a question with RAG over the caller's collection.

    Body: {'query': str, 'session_id': str, 'api_key': str};
    raises 400 when any key is missing. session_id selects the stored
    chat history in redis.
    """

    data : dict = await request.json()

    if (
        'query' not in data or
        'session_id' not in data or
        'api_key' not in data
    ) : raise HTTPException(
        status_code = 400 ,
        detail = 'Correct Params was not supplied'
    )

    query : str = data['query']
    session_id : str = data['session_id']
    api_key : str = data['api_key']

    response : str = await ask_route(
        query ,
        session_id ,
        api_key ,
        logger ,
        embedding_model ,
        milvus_client ,
        chat_redis_client ,
        groq_client
    )

    return {'response' : response}

# Local development entry point.
if __name__ == '__main__' : uvicorn.run(
    'app:app' ,
    host = '0.0.0.0' ,
    port = 8002
)
database/prompt/image_ingestion.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ You are an Image Description Expert
2
+
3
+ - You will be provided with an image
4
+ - You will be provided with some text surrounding it
5
+
6
+ - Your task is to provide a detailed description of the image
7
+ - Go as detailed as you can
8
+
9
+ Surrounding Text : {}
database/prompt/rag.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **Objective:**
2
+ 1. Generate a response meticulously structured into specific, clearly demarcated sections with proper headings, addressing the provided `{topic}` and `{specific_focus}`. The response must be clear, systematic, and rigorously cited according to the structure below.
3
+ 2. Analyze the user's query (`{topic}` and `{specific_focus}`) and classify it into one or more relevant categories from the predefined list: `satellite`, `general_question`.
4
+ 3. Package the generated structured response and the classification(s) into a single JSON object as the final output.
5
+
6
+ **Your Task:**
7
+ First, construct the structured textual response. Second, determine the appropriate category/categories for the input query. Third, combine these into the specified JSON format.
8
+
9
+ **Part 1: Structured Response Generation**
10
+
11
+ Construct your textual output strictly adhering to the following structure, using the specified Markdown headings precisely as shown. Ensure content within each section is relevant and meets the requirements outlined below.
12
+
13
+ **Required Textual Output Structure and Content:**
14
+
15
+ ## Background
16
+
17
+ * **Heading Requirement:** Use the exact heading `## Background`.
18
+ * **Content:** Provide concise contextual information relevant to the `{topic}` and `{specific_focus}`. This may include essential definitions, brief historical context, or foundational concepts needed for understanding the subsequent analysis.
19
+ * **Citation:** Factual statements must be supported by evidence, with citations referring to the "Sources/Citations" section (limit to 3 unique sources recommended for brevity unless essential).
20
+
21
+ ## Response
22
+
23
+ * **Heading Requirement:** Use the exact heading `## Response`.
24
+ * **Content & Structure:** This is the core analytical section. Directly address the `{specific_focus}` concerning the `{topic}`.
25
+ * **Sub-headings:** **Crucially, use appropriate sub-headings (e.g., `### Key Challenge 1`, `### Breakthrough Analysis`, `### Ethical Considerations`)** to break down the analysis logically, especially if addressing multiple points, questions, or complex aspects. This enhances readability and organization.
26
+ * **Systematic Approach:** Address all elements requested or implied within `{specific_focus}` methodically and thoroughly.
27
+ * **Clarity:** Use precise language. Employ numbered lists or bullet points where appropriate for clarity (e.g., listing factors, steps, findings).
28
+ * **In-Text Citations:** **Mandatory:** All factual claims, data, statistics, direct quotes, or paraphrased specific ideas originating from external sources *must* be cited in-text (e.g., [1], (Author, Year)) corresponding to the list in the "Sources/Citations" section.
29
+
30
+ ## Sources/Citations
31
+
32
+ * **Heading Requirement:** Use the exact heading `## Sources/Citations`.
33
+ * **Content:** List all sources cited in the "Background" and "Response" sections.
34
+ * **Format:** Use a consistent citation style (e.g., APA, MLA, Chicago, Vancouver, Numbered). State the style used if possible. Ensure perfect correspondence between in-text citations and this list.
35
+
36
+ **Input Placeholders for Response Generation:**
37
+ * `{topic}`: The general subject area, concept, or entity to be discussed.
38
+ * `{specific_focus}`: The specific sub-topic, question(s), elements to compare, criteria, or perspective for the "Response" section.
39
+
40
+ **Part 2: Query Classification**
41
+
42
+ * **Analyze:** Based on the provided `{topic}` and `{specific_focus}`, determine the most fitting category/categories.
43
+ * **Allowed Categories:** You **must** choose from the following list only:
44
+ * Data Products, Services and Policies
45
+ * EO Missions
46
+ * Applications
47
+ * Remote Sensing and GIS
48
+ * International Collaboration and Cooperation
49
+ * General Questions
50
+ * **Selection:** You can select one or multiple categories if applicable. If unsure or if it doesn't fit well, lean towards `general_question`. Do not use any categories not present in this list.
51
+
52
+ **Part 3: Final Output Format**
53
+
54
+ * **Format Requirement:** Your entire output **must** be a single JSON object.
55
+ * **Structure:** The JSON object must have exactly two keys:
56
+ * `'response'`: The value should be a single string containing the complete, structured Markdown text generated in Part 1 (including all headings, content, and citations). Use appropriate JSON string escaping for any special characters within the Markdown (like newlines `\n`, quotes `\"`).
57
+ * `'category'`: The value should be a JSON list (array) containing the string(s) of the selected category/categories from Part 2.
58
+
59
+ **Example JSON Output Structure:**
60
+ ```json
61
+ {
62
+ "response": "## Background\\n...\\n\\n## Response\\n### Sub-heading 1\\n...\\n\\n## Sources/Citations\\n...",
63
+ "category": ["satellite"]
64
+ }
65
+ ```
66
+ OR
67
+ ```json
68
+ {
69
+ "response": "## Background\\n...\\n\\n## Response\\n### Analysis Point\\n...\\n\\n## Sources/Citations\\n...",
70
+ "category": ["general_question"]
71
+ }
72
+ ```
73
+
74
+ **Execution Mandate:** Generate the structured response as described, classify the query using *only* the allowed categories, and return the final result strictly in the specified JSON format. Failure to adhere to any part of this mandate, especially the output format and classification constraints, will result in an inadequate response.
database/vectordb/.temp ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.13.4
2
+ docker==7.1.0
3
+ fastapi==0.115.12
4
+ google-genai
5
+ google-generativeai
6
+ google-ai-generativelanguage
7
+ groq==0.23.1
8
+ hiredis==3.1.0
9
+ huggingface-hub==0.30.2
10
+ milvus-lite==2.4.12
11
+ numpy==1.26.4
12
+ pdf2image==1.17.0
13
+ pillow==11.2.1
14
+ pymilvus==2.5.8
15
+ PyMuPDF==1.25.5
16
+ PyPDF2==3.0.1
17
+ python-dotenv==1.1.0
18
+ python-multipart==0.0.20
19
+ redis==6.0.0
20
+ requests==2.32.3
21
+ safetensors==0.5.3
22
+ sentence-transformers==4.1.0
23
+ tokenizers==0.21.1
24
+ tqdm==4.67.1
25
+ transformers==4.49.0
26
+ uvicorn==0.34.2
27
+ wasabi
scripts/llm/image.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
from PIL import Image

def caption_image(image_bytes : bytes , image_model , surrounding_text : str ) -> str :
    """Describe an image with a multimodal model, guided by nearby page text.

    image_bytes      : raw encoded image data (any format PIL can open)
    image_model      : object exposing generate_content([...]) returning an
                       object with a .text attribute (a Gemini model here)
    surrounding_text : text found around the image; substituted into the
                       single positional {} slot of the prompt template

    Returns the model's textual description.
    """

    with open('assets/database/prompt/image_ingestion.md') as image_ingestion_prompt_file : prompt = image_ingestion_prompt_file.read().format(surrounding_text)

    image = Image.open(io.BytesIO(image_bytes))

    response = image_model.generate_content([image , prompt])
    response : str = response.text

    return response
14
+
scripts/llm/runner.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from groq import Groq

async def run_groq(messages : list , groq_client : Groq, model = 'llama-3.3-70b-versatile') -> str :
    """Run one chat completion against Groq and return the raw message text.

    messages : OpenAI-style [{'role': ..., 'content': ...}] list
    model    : Groq model id, defaulting to llama-3.3-70b-versatile

    response_format forces the model to emit a JSON object (the rag.md prompt
    asks for {'response': ..., 'category': [...]}).
    """

    chat_completion = groq_client.chat.completions.create(
        messages = messages ,
        model = model ,
        response_format = {'type' : 'json_object'}
    )

    return chat_completion.choices[0].message.content
scripts/llm/services.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

async def load_history(redis_client , session_id) :
    """Return the chat history stored under *session_id*, or [] when absent."""

    raw = redis_client.get(session_id)

    return json.loads(raw) if raw else []

async def save_history(redis_client , row , session_id) :
    """Persist *row* (the full message list) under *session_id* as JSON."""

    redis_client.set(session_id , json.dumps(row))
scripts/loader/loader.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline , AutoTokenizer
2
+ from pymilvus import MilvusClient
3
+ import os
4
+ from redis import Redis
5
+ from sentence_transformers import SentenceTransformer
6
+ import google.generativeai as genai
7
+ from pymilvus import Collection
8
+ from groq import Groq
9
+
10
+ from transformers.pipelines import Pipeline
11
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
12
+
13
+ from logging import Logger , getLogger , basicConfig , INFO , FileHandler , Formatter
14
+
15
def load_logger() -> Logger :
    """Configure and return the module logger, also writing to assets/logs/log.log.

    NOTE(review): every call adds another FileHandler, duplicating log lines —
    confirm this is invoked exactly once at startup (app.py does so).
    """

    if not os.path.exists('assets/logs') : os.makedirs('assets/logs')

    logger : Logger = getLogger(__name__)

    # Root logging configuration: INFO level, timestamped format.
    basicConfig(
        level = INFO ,
        format = '%(asctime)s - %(levelname)s - %(message)s'
    )

    # Mirror everything to a log file with the same format.
    file_handler = FileHandler('assets/logs/log.log')
    file_handler.setLevel(INFO)
    file_handler.setFormatter(Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    logger.addHandler(file_handler)

    return logger
33
+
34
def load_tokenizer(model_name = 'meta-llama/Meta-Llama-3-8B') -> PreTrainedTokenizerFast :
    """Fetch the Hugging Face tokenizer for *model_name*."""

    return AutoTokenizer.from_pretrained(model_name)
39
+
40
def load_milvus_client() -> MilvusClient:
    """Open the Milvus(-lite) database named by MILVUS_DB_NAME (local file default)."""

    db_path = os.getenv('MILVUS_DB_NAME' , 'assets/database/vectordb/demo.db')

    return MilvusClient(db_path)
47
+
48
def load_redis_client(db_name : int) -> Redis :
    """Build a decoded-string Redis client for logical database *db_name*.

    Host and port come from REDIS_HOST / REDIS_PORT, defaulting to localhost:6379.
    """

    host = os.getenv('REDIS_HOST' , 'localhost')
    port = int(os.getenv('REDIS_PORT' , 6379))

    return Redis(
        host = host ,
        port = port ,
        db = db_name ,
        decode_responses = True
    )
58
+
59
def load_embedding_model(model_name = '') -> SentenceTransformer :
    """Load a sentence-transformer; falls back to EMBEDDING_MODEL_NAME, then all-MiniLM-L6-v2."""

    chosen = model_name or os.getenv('EMBEDDING_MODEL_NAME' , 'all-MiniLM-L6-v2')

    return SentenceTransformer(chosen)
66
+
67
def load_gemini_client() :
    """Return a configured Gemini model, or '' when GEMINI_API_KEY is unset.

    Callers treat the empty string as "image captioning disabled".
    """

    gemini_api_key = os.getenv('GEMINI_API_KEY' , '')

    if not gemini_api_key : return ''

    # BUG FIX: the key read from the environment was ignored and a literal
    # placeholder string was passed to configure().
    genai.configure(api_key = gemini_api_key) # ! Could deploy a Llama 3.2 model instead to avoid rate limits
    # Model name is overridable via GEMINI_MODEL (declared in .env.example).
    model_name = os.getenv('GEMINI_MODEL' , 'gemini-1.5-flash')

    return genai.GenerativeModel(model_name)
77
+
78
def load_groq_client() -> Groq :
    """Create a Groq client with the SDK's default configuration.

    NOTE(review): presumably the SDK reads GROQ_API_KEY from the
    environment — confirm against the groq package docs.
    """

    return Groq()
scripts/routers/routers.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import numpy as np
4
+
5
+ from scripts.llm.runner import run_groq
6
+ from scripts.scrapper.page import page_to_docs
7
+
8
+ from scripts.routers.services import hash_url , clean_redis
9
+ from scripts.llm.services import save_history , load_history
10
+ from scripts.scrapper.pdf import pdf_to_docs , pdf_file_to_docs
11
+
12
+ # ! --------------------------------------Typing Annotations--------------------------------------
13
+
14
+ from logging import Logger
15
+ from sentence_transformers import SentenceTransformer
16
+ from pymilvus import MilvusClient
17
+ from redis import Redis
18
+ from groq import Groq
19
+
20
async def add_to_milvus(
    documents : list ,
    milvus_client : 'MilvusClient' ,
    embedding_model : 'SentenceTransformer' ,
    url_prefix : int ,
    url : str ,
    api_key : str
) -> list :
    """Embed *documents* and insert them into the caller's Milvus collection.

    documents  : dicts with at least a 'text' key; every key is copied onto
                 the inserted row alongside 'id' and 'vector'
    url_prefix : per-url hash; row ids are url_prefix * 100_000 + chunk index
    api_key    : Milvus collection name (created on demand, dim 384)

    Returns the list of inserted ids.
    """

    milvus_client.create_collection(collection_name = api_key , dimension = 384)

    texts = [document['text'] for document in documents]
    # Cap at 100_000 chunks so ids never spill into the next url_prefix slot.
    embeddings = embedding_model.encode(texts[ : 100_000] , show_progress_bar = True)

    ids = []

    # BUG FIX: the counter was incremented twice per row, so consecutive
    # chunks got ids 1, 3, 5, ... and half the id space was wasted.
    for chunk_counter , (document , embedding) in enumerate(zip(documents , embeddings) , start = 1) :

        id_ = url_prefix * 100_000 + chunk_counter
        ids.append(id_)

        data = {
            'id' : id_ ,
            'vector' : embedding
        }
        data.update(document)

        milvus_client.insert(
            collection_name = api_key ,
            data = [data]
        )

    if len(documents) > 100_000 : print(f'Warning: Document from {url} had {len(documents)} chunks, but only processed 100_000 ')

    return ids
59
+
60
async def scrape_page_route(
    url : str ,
    api_key : str ,
    logger : Logger ,
    embedding_model : SentenceTransformer ,
    milvus_client : MilvusClient ,
    image_model ,
    url_redis_client : Redis ,
    scrape_images : bool = False
) -> None :
    """Ingest one HTML page: de-duplicate, scrape, embed, index.

    api_key names the Milvus collection; url_redis_client maps url -> list of
    inserted ids so a re-scrape can first delete the previous rows.
    image_model is only used when scrape_images is true (a Gemini model here).
    """

    url_prefix : int = await hash_url(url)
    # Drop rows from a previous ingestion of the same url, if any.
    _ : None = await clean_redis(
        url ,
        url_redis_client ,
        milvus_client ,
        api_key
    )

    documents : list = await page_to_docs(url , scrape_images , image_model)

    logger.info(f'Added {len(documents)} for {api_key} for {url}')

    ids : list = await add_to_milvus(documents , milvus_client , embedding_model , url_prefix , url , api_key)

    # Remember which ids belong to this url for the next clean_redis call.
    url_redis_client.set(url , json.dumps(ids))
86
+
87
async def scrape_pdf_route(
    url : str ,
    api_key : str ,
    logger : Logger ,
    embedding_model : SentenceTransformer ,
    milvus_client : MilvusClient ,
    image_model ,
    url_redis_client : Redis ,
    scrape_images : bool = False
) -> None :
    """Download and ingest one PDF; mirrors scrape_page_route but via pdf_to_docs.

    api_key names the Milvus collection; url_redis_client maps url -> list of
    inserted ids so a re-scrape can first delete the previous rows.
    """

    url_prefix : int = await hash_url(url)
    # Drop rows from a previous ingestion of the same url, if any.
    _ : None = await clean_redis(
        url ,
        url_redis_client ,
        milvus_client ,
        api_key
    )

    documents : list = await pdf_to_docs(url , scrape_images , image_model)

    logger.info(f'Added {len(documents)} for {api_key} for {url}')

    ids : list = await add_to_milvus(documents , milvus_client , embedding_model , url_prefix , url , api_key)

    # Remember which ids belong to this url for the next clean_redis call.
    url_redis_client.set(url , json.dumps(ids))
113
+
114
async def scrape_pdf__file_route(
    filename : str ,
    api_key : str ,
    logger : Logger ,
    contents : bytes ,
    embedding_model : SentenceTransformer ,
    milvus_client : MilvusClient ,
    url_redis_client : Redis ,
) -> None :
    """Ingest an uploaded PDF (raw *contents*) into the caller's collection.

    The upload is written under assets/pdfs/ before being parsed.
    """

    url_prefix : int = await hash_url(filename)
    # BUG FIX: clean_redis requires a collection name; the original call
    # passed only 3 arguments and raised TypeError at runtime.
    _ : None = await clean_redis(
        filename ,
        url_redis_client ,
        milvus_client ,
        api_key
    )

    # BUG FIX: the destination path contained a literal placeholder instead
    # of the uploaded file's name, so every upload overwrote the same file.
    filename = f'assets/pdfs/{filename}'

    with open(filename , 'wb') as pdf_file : pdf_file.write(contents)

    documents : list = await pdf_file_to_docs(filename)

    logger.info(f'Added {len(documents)} for {api_key} for {filename}')

    ids : list = await add_to_milvus(documents , milvus_client , embedding_model , url_prefix , filename , api_key)

    url_redis_client.set(filename , json.dumps(ids))
142
+
143
async def ask_route(
    query : str ,
    session_id : str ,
    api_key : str ,
    logger : Logger ,
    embedding_model : SentenceTransformer ,
    milvus_client : MilvusClient ,
    chat_redis_client : Redis ,
    groq_client : Groq ,
) -> str :
    """Answer *query* with RAG: retrieve context, run Groq, persist history.

    api_key names the Milvus collection to search; session_id keys the chat
    history in redis. Returns the model's raw (JSON-formatted) response string.
    """

    query_embeddings = embedding_model.encode(query)

    # Top-2 nearest chunks from the caller's collection.
    results : list = milvus_client.search(
        collection_name = api_key ,
        data = [query_embeddings] ,
        limit = 2 ,
        output_fields = ['text' , 'source']
    )[0]

    context = '\n'.join([f'''Content : {row['entity']['text']} + {row['entity']['source']}''' for row in results])

    with open('assets/database/prompt/rag.md') as rag_prompt_file : prompt = rag_prompt_file.read()

    history : list = await load_history(chat_redis_client , session_id)

    # Fresh sessions start with the RAG system prompt.
    if history == [] : history = [
        {
            'role' : 'system' ,
            'content' : prompt
        }
    ]

    history.append({
        'role' : 'user' ,
        'content' : f'''
Context : {context}

Query : {query}
'''
    })

    response : str = await run_groq(history , groq_client)

    history.append({
        'role' : 'assistant' ,
        'content' : response
    })

    # BUG FIX: log message typo 'Answerd' -> 'Answered'.
    logger.info(f'Answered : {response} : for {api_key} for {query}')

    await save_history(chat_redis_client , history , session_id)

    return response
scripts/routers/services.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from pymilvus import MilvusClient
3
+ from redis import Redis
4
+ import json
5
+ import os
6
+
7
async def hash_url(url : str) -> int :
    """Map *url* to a stable int prefix: first 8 hex chars (32 bits) of its MD5."""

    digest = hashlib.md5(url.encode()).hexdigest()

    return int(digest[ : 8] , 16)
14
+
15
async def clean_redis(
    url : str ,
    url_redis_client : 'Redis' ,
    milvus_client : 'MilvusClient' ,
    collection_name : str
) -> None :
    """Delete any Milvus rows previously ingested for *url*.

    The id list for *url* was stored in redis by the scrape routes; when it
    exists, those rows are removed from *collection_name* so a re-scrape does
    not duplicate data.
    """

    cached = url_redis_client.get(url)

    if not cached : return

    previous_ids = json.loads(cached)

    print(f'Cleaning Milvus for {url} : {len(previous_ids)}')

    milvus_client.delete(
        collection_name = collection_name,
        filter = f'id in {previous_ids}'
    )
scripts/scrapper/page.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from tqdm import tqdm
3
+
4
+ from urllib.parse import urljoin
5
+
6
+ from scripts.scrapper.services import create_soup
7
+ from scripts.llm.image import caption_image
8
+
9
async def get_images(soup , url : str) :
    """Yield the absolute URL of every <img> on the parsed page *soup*.

    Relative src values are resolved against *url*; tags without a src
    attribute are skipped.
    """

    img_tags = soup.find_all('img')

    for img_tag in tqdm(img_tags , total = len(img_tags) , desc = 'Saving Images' , leave = False) :

        img_url = img_tag.get('src')

        if img_url : yield urljoin(url , img_url)
18
+
19
async def image_to_bytes(img_url : str) -> bytes :
    """Download *img_url* and return the raw image bytes.

    Raises requests.HTTPError on a non-2xx response.
    """

    img_response = requests.get(img_url , stream = True)
    img_response.raise_for_status()

    # BUG FIX: the accumulator was never initialised, so the original
    # raised UnboundLocalError on the first chunk.
    image_bytes = b''
    for chunk in img_response.iter_content() : image_bytes += chunk

    return image_bytes
27
+
28
async def page_to_docs(url : str , scrape_image : bool = False , image_model = None) -> list :
    """Scrape *url* into a list of Milvus-ready document dicts.

    BUG FIXES vs the original:
    - parameter order now matches the only caller (routers.py passes
      url, scrape_images, image_model in that order)
    - word chunking iterates over len(text_chunks), not len(text_content),
      which produced massively overlapping chunks
    - image documents use 'source' (was misspelled 'souce')
    - duplicate 'type' dict keys collapsed to the value that actually won
    """

    documents = []

    soup = await create_soup(url)

    text_content = soup.get_text(separator = '\n' , strip = True)
    words = text_content.split(' ')

    # Non-overlapping 512-word chunks.
    text_chunks = [' '.join(words[index : index + 512]) for index in range(0 , len(words) , 512)]

    documents.extend([
        {
            'text' : chunk ,
            'source' : url ,
            'raw_source' : url ,
            'type' : 'url'
        } for chunk in text_chunks
        if chunk
    ])

    if scrape_image :

        async for img_url in get_images(soup , url) :

            image_bytes : bytes = await image_to_bytes(img_url)

            response : str = caption_image(image_bytes , image_model , text_content)

            documents.append(
                {
                    'text' : response ,
                    'source' : img_url ,
                    'raw_source' : image_bytes ,
                    'type' : 'image'
                }
            )

    return documents
scripts/scrapper/pdf.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+ import fitz
4
+ import PyPDF2
5
+ from tqdm import tqdm
6
+ from scripts.llm.image import caption_image
7
+
8
async def download_pdf(pdf_link : str) -> str :
    """Stream *pdf_link* to assets/pdfs/<basename> and return the local path.

    Raises requests.HTTPError on a non-2xx response.
    """

    response = requests.get(pdf_link , stream=True)
    response.raise_for_status()

    # BUG FIX: the computed basename was discarded and every download was
    # written to the same placeholder path, clobbering earlier files.
    basename = os.path.basename(pdf_link)
    os.makedirs('assets/pdfs' , exist_ok = True)
    filename = f'assets/pdfs/{basename}'

    with open(filename , 'wb') as pdf_file :

        for chunk in response.iter_content(chunk_size = 8192) : pdf_file.write(chunk)

    return filename
21
+
22
+
23
async def pdf_to_docs(pdf_link : str , scrape_image : bool , gemini_client) :
    """Download a PDF and convert it to Milvus-ready document dicts.

    Text is split into fixed 512-character chunks per page; when
    *scrape_image* is true, embedded images are captioned with *gemini_client*.
    """

    documents = []

    pdf_name : str = await download_pdf(pdf_link)

    py_pdf_object = PyPDF2.PdfReader(pdf_name)
    fi_pdf_object = fitz.open(pdf_name)

    num_pages = len(py_pdf_object.pages)

    for page_num in tqdm(range(num_pages) , total = num_pages) :

        page = py_pdf_object.pages[page_num]
        text = page.extract_text()

        text_chunks = [text[index : index + 512] for index in range(0 , len(text) , 512)]

        documents.extend([
            {
                'text' : chunk ,
                'source' : (pdf_link , page_num) ,
                'raw_source' : pdf_link ,
                'type' : 'pdf'
            } for chunk in text_chunks
            if chunk
        ])

        if scrape_image :

            # BUG FIX: the original loaded page `num_pages` (one past the
            # last valid index) on every iteration instead of the current page.
            page = fi_pdf_object.load_page(page_num)
            images = page.get_images(full = True)

            for image in tqdm(images , total = len(images)) :

                image = fi_pdf_object.extract_image(image[0])
                image_bytes : bytes = image['image']

                response : str = caption_image(image_bytes , gemini_client , text)

                documents.append(
                    {
                        'text' : response ,
                        'source' : pdf_name ,   # was misspelled 'souce'
                        'raw_source' : image_bytes ,
                        'type' : 'image'
                    }
                )

    return documents
+
76
+ async def pdf_file_to_docs(pdf_name : str) :
77
+
78
+ documents = []
79
+
80
+ py_pdf_object = PyPDF2.PdfReader(pdf_name)
81
+
82
+ num_pages = len(py_pdf_object.pages)
83
+
84
+ for page_num in tqdm(range(num_pages) , total = num_pages) :
85
+
86
+ page = py_pdf_object.pages[page_num]
87
+ text = page.extract_text()
88
+
89
+ text_chunks = [text[index : index + 512] for index in range(0 , len(text) , 512)]
90
+
91
+ documents.extend([
92
+ {
93
+ 'type' : 'text' ,
94
+ 'text' : chunk ,
95
+ 'source' : (pdf_name , page_num) ,
96
+ 'raw_source' : pdf_name ,
97
+ 'type' : 'pdf'
98
+ } for chunk in text_chunks
99
+ if chunk
100
+ ])
101
+
102
+
103
+ return documents
scripts/scrapper/services.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from urllib.parse import urljoin
4
+ from collections import deque
5
+ from typing import Tuple , List , Set
6
+
7
+ from scripts.services.services import process_link
8
+
9
async def create_soup(url : str) -> BeautifulSoup :
    """Fetch *url* and parse the response body with html.parser.

    Raises requests.HTTPError for non-2xx responses.
    """

    page = requests.get(url)
    page.raise_for_status()

    return BeautifulSoup(page.content , 'html.parser')
18
+
19
async def process_page(url : str) :
    """Yield each crawlable href found on the page at *url*.

    Fetches and parses the page, then filters every <a href> through
    process_link, which drops mailto: links and direct media files.
    """

    soup : BeautifulSoup = await create_soup(url)

    links = soup.find_all('a' , href = True)

    for a_tag in links :

        href = a_tag['href']

        # process_link returns None for links that should not be crawled.
        href = await process_link(href)

        if href : yield href
32
+
33
async def get_pdf_links(base_html : str) :
    """Breadth-first crawl of *base_html*, collecting PDF links and on-site pages.

    Only links under the *base_html* prefix are followed. Returns
    (pdf_links, all_links): PDF urls found, and the set of visited page urls.
    """

    pdf_links : List[str] = []
    all_links : Set[str] = set()
    all_links.add(base_html)

    visited_urls : set = set()

    url_queue : deque = deque([base_html])

    visited_urls.add(base_html)

    while url_queue :

        current_url : str = url_queue.popleft()

        print(current_url , len(url_queue))

        # BUG FIX: a bare `except: pass` swallowed every error (including
        # KeyboardInterrupt); failures are now reported and the crawl continues.
        try :

            async for link in process_page(current_url) :

                if link.endswith('pdf') :

                    if not link.startswith('http') : link : str = f'{base_html}{link}'

                    pdf_links.append(link)

                else :

                    absolute_url : str = urljoin(current_url , link)

                    if absolute_url.startswith(base_html) and absolute_url not in visited_urls :

                        visited_urls.add(absolute_url)
                        all_links.add(absolute_url)
                        url_queue.append(absolute_url)

        except Exception as error : print(f'Failed to process {current_url} : {error}')

    return pdf_links , all_links
scripts/services/services.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
async def process_link(href) :
    """Return *href* unchanged when it is a crawlable link, else None.

    Filters out falsy/non-string values, mailto: links and direct media files.
    """

    skipped_suffixes = ('png' , 'jpg' , 'jpeg' , 'gif' , 'bmp' , 'tiff' , 'svg' , 'webp' , 'webm' , 'mp4')   # ! Add suffixes as needed

    if not href or not isinstance(href , str) : return None
    if href.startswith('mailto:') : return None
    if href.endswith(skipped_suffixes) : return None

    return href
20
+
test.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
import json
from tqdm import tqdm
from requests import Response

# Manual smoke tests against a locally running server (port 8002).

def scrape_url_and_save_in_json_file(url : str , json_file_path : str) -> None :
    """Crawl *url* via /scrape-url and dump the discovered links to a JSON file."""

    response : Response = requests.post(
        'http://localhost:8002/scrape-url' ,
        json = {'url' : url}
    )

    with open(json_file_path , 'w') as json_file : json.dump(response.json() , json_file)

def scrape_pdfs(json_file_path : str , api_key : str) -> None :
    """Ingest every 'pdf_links' entry of the saved crawl via /scrape-pdf."""

    with open(json_file_path) as json_file : links = json.load(json_file)

    pdf_links = links['pdf_links']

    for link in tqdm(pdf_links , total = len(pdf_links)) :

        response : Response = requests.post(
            'http://localhost:8002/scrape-pdf' ,
            json = {
                'url' : link ,
                'api_key' : api_key ,
                'scrape-images' : False
            }
        )

def scrape_page(json_file_path : str , api_key : str) -> None :
    """Ingest every 'all_links' entry of the saved crawl via /scrape-page."""

    with open(json_file_path) as json_file : links = json.load(json_file)

    all_links = links['all_links']

    for link in tqdm(all_links , total = len(all_links)) :

        response : Response = requests.post(
            'http://localhost:8002/scrape-page' ,
            json = {
                'url' : link ,
                'api_key' : api_key ,
                'scrape-images' : False
            }
        )

def ask(query , session_id , api_key) -> dict :
    """Send one question to /ask, pretty-print and return the JSON response."""

    response : Response = requests.post(
        'http://localhost:8002/ask' ,
        json = {
            'query' : query ,
            'session_id' : session_id ,
            'api_key' : api_key
        }
    )

    print(json.dumps(response.json() , indent = 4))

    return response.json()

# scrape_url_and_save_in_json_file('https://www.thetravellerdmc.com/' , 'temp.json')
scrape_pdfs('temp.json' , 'vk_LANm5E0tSA3bWdN46wW4DVOlJWAMbNoa874BiNSXz7dFIddNzp5I04BNoG2mlI4N')
scrape_page('temp.json' , 'vk_LANm5E0tSA3bWdN46wW4DVOlJWAMbNoa874BiNSXz7dFIddNzp5I04BNoG2mlI4N')
test.sh ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Root Endpoint (/) - Remains GET as it doesn't take a body
3
+ curl https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/
4
+
5
+ # Scrape URL Endpoint (/scrape-url)
6
+ curl -X POST https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/scrape-url \
7
+ -H "Content-Type: application/json" \
8
+ -d '{
9
+ "url": "https://www.nrsc.gov.in/Knowledge_EBooks/",
10
+ }'
11
+
12
+ # Scrape Page Endpoint (/scrape-page)
13
+ curl -X POST https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/scrape-page \
14
+ -H "Content-Type: application/json" \
15
+ -d '{
16
+ "url": "https://voicexp.ai/",
17
+ "scrape-images": true
18
+ }'
19
+
20
+ # Scrape PDF Endpoint (/scrape-pdf)
21
+ curl -X POST https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/scrape-pdf \
22
+ -H "Content-Type: application/json" \
23
+ -d '{
24
+ "url": "http://example.com/path/to/your/document.pdf",
25
+ "scrape-image": false
26
+ }'
27
+
28
+ curl -X POST "https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/scrape-pdf-file" \
29
+ -H "Content-Type: multipart/form-data" \
30
+ -F "file=@/path/to/your/document.pdf"
31
+
32
+
33
+ # Ask Endpoint (/ask)
34
+ curl -X POST https://-01j7860s1h540pyys2dz7kcae1fbk9.cloudspaces.litng.ai/ask \
35
+ -H "Content-Type: application/json" \
36
+ -d '{
37
+ "query": "What is the main topic of the ingested documents?",
38
+ "session_id": "user123_chat789"
39
+ }'
40
+
41
+ curl -X GET "https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/number-of-queries"
42
+
43
+ curl -X GET "https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/sentiment"
44
+
45
+ curl -X GET "https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/token-count"
46
+
47
+ curl -X GET "https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai/category"