Spaces:

ayushsinghal1510
/

sts-llm-backend

Runtime error

App Files Files Community

ayushsinghal1510 commited on Jun 23, 2025

Commit

cc65c1f

1 Parent(s): 60904cd

Init COmmit

Browse files

Files changed (21) hide show

.env.example +13 -0
.gitattributes +8 -0
.gitignore +9 -0
README.md +1 -1
app.py +166 -0
database/prompt/image_ingestion.md +9 -0
database/prompt/rag.md +74 -0
database/vectordb/.temp +0 -0
requirements.txt +27 -0
scripts/llm/image.py +14 -0
scripts/llm/runner.py +11 -0
scripts/llm/services.py +30 -0
scripts/loader/loader.py +82 -0
scripts/routers/routers.py +196 -0
scripts/routers/services.py +34 -0
scripts/scrapper/page.py +68 -0
scripts/scrapper/pdf.py +103 -0
scripts/scrapper/services.py +75 -0
scripts/services/services.py +20 -0
test.py +66 -0
test.sh +47 -0

.env.example ADDED Viewed

	@@ -0,0 +1,13 @@

+MILVUS_DB_NAME=''
+MIVLUS_MODEL_NAME=''
+MIVLUS_VECTOR_SIZE=''
+MILVUS_COLLECTION_NAME=''
+REDIS_HOST=''
+REDIS_PORT=''
+GEMINI_API_KEY=''
+GEMINI_MODEL=''
+GROQ_API_KEY=''
+GROQ_MODEL=''

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.env
+.pyc
+__pycache__/
+*.py[cod]
+*$py.class
+.db
+.lock

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+*.env
+*.rdb
+*.pyc
+*.pdf
+*.log
+temp.json
+__pycache__/

README.md CHANGED Viewed

@@ -7,4 +7,4 @@ sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import uvicorn
+from logging import Logger
+from dotenv import load_dotenv
+from fastapi import FastAPI , Request , HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from scripts.scrapper.services import get_pdf_links
+from scripts.loader.loader import (
+    load_embedding_model ,
+    load_milvus_client ,
+    load_redis_client ,
+    load_embedding_model ,
+    load_gemini_client ,
+    load_groq_client ,
+    load_logger
+)
+from scripts.routers.routers import (
+    scrape_page_route ,
+    scrape_pdf_route ,
+    ask_route ,
+)
# Environment variables must be loaded before any client factory reads them.
load_dotenv()

# Shared, process-wide service clients used by every request handler.
milvus_client = load_milvus_client()
chat_redis_client = load_redis_client(0)  # Redis DB 0: chat history per session
url_redis_client = load_redis_client(2)   # Redis DB 2: ingested URL -> Milvus ids
embedding_model = load_embedding_model()
gemini_client = load_gemini_client()
groq_client = load_groq_client()
logger: Logger = load_logger()

app = FastAPI()
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - confirm this is intended for the deployed service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)

logger.info('Server Startup')
@app.get('/')
async def read_root():
    """Return a fixed greeting payload for ``GET /``."""
    greeting = {'Hello': 'World'}
    return greeting
@app.post('/scrape-url')
async def scrape_url(request: Request) -> dict:
    """Crawl the site rooted at the supplied URL and report the links found.

    The request body must be a JSON object containing a ``url`` key.

    Returns:
        A dict with ``pdf_links`` and ``all_links`` exactly as produced by
        ``get_pdf_links``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no ``url`` key.
    """
    try:
        data = await request.json()
    except ValueError as error:
        # json.JSONDecodeError and UnicodeDecodeError both subclass ValueError;
        # without this a malformed body surfaces as a 500.
        raise HTTPException(
            status_code=400,
            detail='Request body must be valid JSON',
        ) from error

    if not isinstance(data, dict) or 'url' not in data:
        raise HTTPException(
            status_code=400,
            detail='URL was not supplied',
        )

    pdf_links, all_links = await get_pdf_links(data['url'])
    return {
        'pdf_links': pdf_links,
        'all_links': all_links,
    }
@app.post('/scrape-page')
async def scrape_page(request: Request) -> None:
    """Ingest one web page into the caller's Milvus collection.

    The request body must be a JSON object with the keys ``url``,
    ``scrape-images`` and ``api_key``. The work itself is delegated to
    ``scrape_page_route`` together with the module-level clients.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or lacks one of the required keys.
    """
    try:
        data = await request.json()
    except ValueError as error:
        # Malformed JSON would otherwise bubble up as a 500.
        raise HTTPException(
            status_code=400,
            detail='Request body must be valid JSON',
        ) from error

    required_keys = ('url', 'scrape-images', 'api_key')
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        raise HTTPException(
            status_code=400,
            detail='Correct Params was not supplied',
        )

    url: str = data['url']
    scrape_images: bool = data['scrape-images']
    api_key: str = data['api_key']

    await scrape_page_route(
        url,
        api_key,
        logger,
        embedding_model,
        milvus_client,
        gemini_client,
        url_redis_client,
        scrape_images,
    )
@app.post('/scrape-pdf')
async def scrape_pdf(request: Request) -> None:
    """Ingest one remote PDF into the caller's Milvus collection.

    The request body must be a JSON object with the keys ``url``,
    ``scrape-images`` and ``api_key``. The work itself is delegated to
    ``scrape_pdf_route`` together with the module-level clients.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or lacks one of the required keys.
    """
    try:
        data = await request.json()
    except ValueError as error:
        # Malformed JSON would otherwise bubble up as a 500.
        raise HTTPException(
            status_code=400,
            detail='Request body must be valid JSON',
        ) from error

    required_keys = ('url', 'scrape-images', 'api_key')
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        raise HTTPException(
            status_code=400,
            detail='Correct Params was not supplied',
        )

    url: str = data['url']
    scrape_images: bool = data['scrape-images']
    api_key: str = data['api_key']

    await scrape_pdf_route(
        url,
        api_key,
        logger,
        embedding_model,
        milvus_client,
        gemini_client,
        url_redis_client,
        scrape_images,
    )
@app.post('/ask')
async def ask(request: Request) -> dict:
    """Answer a question against the caller's ingested documents.

    The request body must be a JSON object with the keys ``query``,
    ``session_id`` and ``api_key``. The answer is produced by ``ask_route``.

    Returns:
        ``{'response': <answer string returned by ask_route>}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or lacks one of the required keys.
    """
    try:
        data = await request.json()
    except ValueError as error:
        # Malformed JSON would otherwise bubble up as a 500.
        raise HTTPException(
            status_code=400,
            detail='Request body must be valid JSON',
        ) from error

    required_keys = ('query', 'session_id', 'api_key')
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        raise HTTPException(
            status_code=400,
            detail='Correct Params was not supplied',
        )

    query: str = data['query']
    session_id: str = data['session_id']
    api_key: str = data['api_key']

    response: str = await ask_route(
        query,
        session_id,
        api_key,
        logger,
        embedding_model,
        milvus_client,
        chat_redis_client,
        groq_client,
    )
    return {'response': response}
if __name__ == '__main__':
    # Direct execution: serve the app on every interface, port 8002.
    uvicorn.run('app:app', host='0.0.0.0', port=8002)

database/prompt/image_ingestion.md ADDED Viewed

	@@ -0,0 +1,9 @@

+You are an Image Description Expert
+- You will be provided with an image
+- You will be provided with the text surrounding it
+- Your task is to provide a detailed description of the image
+- Go into as much detail as you can
+Surrounding Text : {}

database/prompt/rag.md ADDED Viewed

	@@ -0,0 +1,74 @@

+**Objective:**
+1.  Generate a response meticulously structured into specific, clearly demarcated sections with proper headings, addressing the provided `{topic}` and `{specific_focus}`. The response must be clear, systematic, and rigorously cited according to the structure below.
+2.  Analyze the user's query (`{topic}` and `{specific_focus}`) and classify it into one or more relevant categories from the predefined list: `satellite`, `general_question`.
+3.  Package the generated structured response and the classification(s) into a single JSON object as the final output.
+**Your Task:**
+First, construct the structured textual response. Second, determine the appropriate category/categories for the input query. Third, combine these into the specified JSON format.
+**Part 1: Structured Response Generation**
+Construct your textual output strictly adhering to the following structure, using the specified Markdown headings precisely as shown. Ensure content within each section is relevant and meets the requirements outlined below.
+**Required Textual Output Structure and Content:**
+## Background
+*   **Heading Requirement:** Use the exact heading `## Background`.
+*   **Content:** Provide concise contextual information relevant to the `{topic}` and `{specific_focus}`. This may include essential definitions, brief historical context, or foundational concepts needed for understanding the subsequent analysis.
+*   **Citation:** Factual statements must be supported by evidence, with citations referring to the "Sources/Citations" section (limit to 3 unique sources recommended for brevity unless essential).
+## Response
+*   **Heading Requirement:** Use the exact heading `## Response`.
+*   **Content & Structure:** This is the core analytical section. Directly address the `{specific_focus}` concerning the `{topic}`.
+    *   **Sub-headings:** **Crucially, use appropriate sub-headings (e.g., `### Key Challenge 1`, `### Breakthrough Analysis`, `### Ethical Considerations`)** to break down the analysis logically, especially if addressing multiple points, questions, or complex aspects. This enhances readability and organization.
+    *   **Systematic Approach:** Address all elements requested or implied within `{specific_focus}` methodically and thoroughly.
+    *   **Clarity:** Use precise language. Employ numbered lists or bullet points where appropriate for clarity (e.g., listing factors, steps, findings).
+    *   **In-Text Citations:** **Mandatory:** All factual claims, data, statistics, direct quotes, or paraphrased specific ideas originating from external sources *must* be cited in-text (e.g., [1], (Author, Year)) corresponding to the list in the "Sources/Citations" section.
+## Sources/Citations
+*   **Heading Requirement:** Use the exact heading `## Sources/Citations`.
+*   **Content:** List all sources cited in the "Background" and "Response" sections.
+*   **Format:** Use a consistent citation style (e.g., APA, MLA, Chicago, Vancouver, Numbered). State the style used if possible. Ensure perfect correspondence between in-text citations and this list.
+**Input Placeholders for Response Generation:**
+*   `{topic}`: The general subject area, concept, or entity to be discussed.
+*   `{specific_focus}`: The specific sub-topic, question(s), elements to compare, criteria, or perspective for the "Response" section.
+**Part 2: Query Classification**
+*   **Analyze:** Based on the provided `{topic}` and `{specific_focus}`, determine the most fitting category/categories.
+*   **Allowed Categories:** You **must** choose from the following list only:
+    * Data Products, Services and Policies
+    * EO Missions
+    * Applications
+    * Remote Sensing and GIS
+    * International Collaboration and Cooperation
+    * General Questions
+*   **Selection:** You can select one or multiple categories if applicable. If unsure or if it doesn't fit well, lean towards `general_question`. Do not use any categories not present in this list.
+**Part 3: Final Output Format**
+*   **Format Requirement:** Your entire output **must** be a single JSON object.
+*   **Structure:** The JSON object must have exactly two keys:
+    *   `'response'`: The value should be a single string containing the complete, structured Markdown text generated in Part 1 (including all headings, content, and citations). Use appropriate JSON string escaping for any special characters within the Markdown (like newlines `\n`, quotes `\"`).
+    *   `'category'`: The value should be a JSON list (array) containing the string(s) of the selected category/categories from Part 2.
+**Example JSON Output Structure:**
+```json
+{
+  "response": "## Background\\n...\\n\\n## Response\\n### Sub-heading 1\\n...\\n\\n## Sources/Citations\\n...",
+  "category": ["satellite"]
+}
+```
+OR
+```json
+{
+  "response": "## Background\\n...\\n\\n## Response\\n### Analysis Point\\n...\\n\\n## Sources/Citations\\n...",
+  "category": ["general_question"]
+}
+```
+**Execution Mandate:** Generate the structured response as described, classify the query using *only* the allowed categories, and return the final result strictly in the specified JSON format. Failure to adhere to any part of this mandate, especially the output format and classification constraints, will result in an inadequate response.

database/vectordb/.temp ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,27 @@

+beautifulsoup4==4.13.4
+docker==7.1.0
+fastapi==0.115.12
+google-genai
+google-generativeai
+google-ai-generativelanguage
+groq==0.23.1
+hiredis==3.1.0
+huggingface-hub==0.30.2
+milvus-lite==2.4.12
+numpy==1.26.4
+pdf2image==1.17.0
+pillow==11.2.1
+pymilvus==2.5.8
+PyMuPDF==1.25.5
+PyPDF2==3.0.1
+python-dotenv==1.1.0
+python-multipart==0.0.20
+redis==6.0.0
+requests==2.32.3
+safetensors==0.5.3
+sentence-transformers==4.1.0
+tokenizers==0.21.1
+tqdm==4.67.1
+transformers==4.49.0
+uvicorn==0.34.2
+wasabi

scripts/llm/image.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import io
+from PIL import Image
def caption_image(image_bytes: bytes, image_model, surrounding_text: str) -> str:
    """Ask ``image_model`` to describe an image.

    The prompt template is read from disk and its single ``{}`` placeholder is
    filled with ``surrounding_text``; the decoded image and the prompt are then
    passed to ``image_model.generate_content``.

    Returns:
        The ``text`` attribute of the model response.
    """
    # NOTE(review): the path is relative to the working directory - confirm the
    # ``assets/`` prefix exists where the service runs.
    with open('assets/database/prompt/image_ingestion.md') as prompt_file:
        prompt = prompt_file.read().format(surrounding_text)

    picture = Image.open(io.BytesIO(image_bytes))
    model_reply = image_model.generate_content([picture, prompt])
    return model_reply.text

scripts/llm/runner.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from groq import Groq
async def run_groq(messages: list, groq_client: Groq, model='llama-3.3-70b-versatile') -> str:
    """Run a JSON-mode chat completion on Groq and return the reply text.

    Args:
        messages: Chat messages passed through unchanged to the API.
        groq_client: Client whose ``chat.completions.create`` is invoked.
        model: Groq model identifier.

    Returns:
        ``choices[0].message.content`` of the completion.
    """
    import asyncio

    # The client call is synchronous; running it in a worker thread keeps the
    # event loop free to serve other requests while waiting on the network.
    chat_completion = await asyncio.to_thread(
        groq_client.chat.completions.create,
        messages=messages,
        model=model,
        response_format={'type': 'json_object'},
    )
    return chat_completion.choices[0].message.content

scripts/llm/services.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import json
async def load_history(redis_client, session_id):
    """Return the chat history stored under ``session_id``.

    The value is read with ``redis_client.get`` and JSON-decoded; an empty list
    is returned when nothing (or an empty value) is stored.
    """
    stored = redis_client.get(session_id)
    return json.loads(stored) if stored else []
async def save_history(redis_client, row, session_id):
    """Store ``row`` as JSON under ``session_id`` via ``redis_client.set``."""
    serialized = json.dumps(row)
    redis_client.set(session_id, serialized)

scripts/loader/loader.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from transformers import pipeline , AutoTokenizer
+from pymilvus import MilvusClient
+import os
+from redis import Redis
+from sentence_transformers import SentenceTransformer
+import google.generativeai as genai
+from pymilvus import Collection
+from groq import Groq
+from transformers.pipelines import Pipeline
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from logging import Logger , getLogger , basicConfig , INFO , FileHandler , Formatter
def load_logger() -> Logger:
    """Return this module's logger, writing to ``assets/logs/log.log``.

    The log directory is created when missing, root logging is configured at
    INFO level, and a file handler is attached to the logger. Calling the
    function again returns the same logger without attaching a second file
    handler, so records are not written twice.
    """
    log_dir = 'assets/logs'
    log_path = 'assets/logs/log.log'
    log_format = '%(asctime)s - %(levelname)s - %(message)s'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(log_dir, exist_ok=True)

    logger: Logger = getLogger(__name__)
    basicConfig(level=INFO, format=log_format)

    # FileHandler stores the absolute path of its file in ``baseFilename``.
    target = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        file_handler = FileHandler(log_path)
        file_handler.setLevel(INFO)
        file_handler.setFormatter(Formatter(log_format))
        logger.addHandler(file_handler)

    return logger
def load_tokenizer(model_name='meta-llama/Meta-Llama-3-8B') -> PreTrainedTokenizerFast:
    """Load the tokenizer for ``model_name`` via ``AutoTokenizer.from_pretrained``."""
    return AutoTokenizer.from_pretrained(model_name)
def load_milvus_client() -> MilvusClient:
    """Create a Milvus client for the database named by ``MILVUS_DB_NAME``.

    Falls back to ``assets/database/vectordb/demo.db`` when the variable is
    not set.
    """
    db_path = os.getenv('MILVUS_DB_NAME', 'assets/database/vectordb/demo.db')
    return MilvusClient(db_path)
def load_redis_client(db_name: int) -> Redis:
    """Create a Redis client for logical database ``db_name``.

    Host and port come from ``REDIS_HOST`` / ``REDIS_PORT`` (defaults:
    ``localhost`` and ``6379``). Responses are decoded to ``str``.
    """
    host = os.getenv('REDIS_HOST', 'localhost')
    port = int(os.getenv('REDIS_PORT', 6379))
    return Redis(host=host, port=port, db=db_name, decode_responses=True)
def load_embedding_model(model_name='') -> SentenceTransformer:
    """Load a sentence-transformers model.

    When ``model_name`` is empty, the name is taken from
    ``EMBEDDING_MODEL_NAME`` and defaults to ``all-MiniLM-L6-v2``.
    """
    resolved_name = model_name or os.getenv('EMBEDDING_MODEL_NAME', 'all-MiniLM-L6-v2')
    return SentenceTransformer(resolved_name)
def load_gemini_client():
    """Return a configured ``gemini-1.5-flash`` model, or ``''`` without a key.

    The API key is read from ``GEMINI_API_KEY``. When it is missing or empty
    the function returns an empty string instead of a model, so callers get a
    falsy value they can pass around.
    """
    gemini_api_key = os.getenv('GEMINI_API_KEY', '')
    if not gemini_api_key:
        return ''

    # Use the key that was actually read; a placeholder literal here makes every
    # Gemini request fail authentication.
    # ! Can deploy a Llama 3.2 Model and use that instead, which can increase speed and avoid rate limits and increase safety as well
    genai.configure(api_key=gemini_api_key)
    return genai.GenerativeModel('gemini-1.5-flash')
def load_groq_client() -> Groq:
    """Create a Groq client using the library's default configuration."""
    return Groq()

scripts/routers/routers.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import json
+import numpy as np
+from scripts.llm.runner import run_groq
+from scripts.scrapper.page import page_to_docs
+from scripts.routers.services import hash_url , clean_redis
+from scripts.llm.services import save_history , load_history
+from scripts.scrapper.pdf import pdf_to_docs , pdf_file_to_docs
+# ! --------------------------------------Typing Annotations--------------------------------------
+from logging import Logger
+from sentence_transformers import SentenceTransformer
+from pymilvus import MilvusClient
+from redis import Redis
+from groq import Groq
async def add_to_milvus(
    documents: list,
    milvus_client: MilvusClient,
    embedding_model: SentenceTransformer,
    url_prefix: int,
    url: str,
    api_key: str,
) -> list:
    """Embed ``documents`` and insert them into the ``api_key`` collection.

    Each row gets the id ``url_prefix * 100_000 + n`` where ``n`` counts the
    document's chunks from 1, so at most 100_000 chunks per source are stored
    and every source owns a disjoint id range.

    Args:
        documents: Dicts that each contain at least a ``text`` key; all keys
            are stored alongside the vector.
        milvus_client: Client used to create the collection and insert rows.
        embedding_model: Model whose ``encode`` yields one vector per text.
        url_prefix: Integer identifying the source (see ``hash_url``).
        url: Source identifier, used only in the truncation warning.
        api_key: Name of the target collection.

    Returns:
        The ids of the inserted rows, in insertion order.
    """
    max_chunks = 100_000

    milvus_client.create_collection(collection_name=api_key, dimension=384)

    texts = [document['text'] for document in documents]
    embeddings = embedding_model.encode(texts[:max_chunks], show_progress_bar=True)

    ids = []
    # zip() stops at the shorter input, i.e. at ``max_chunks`` embeddings.
    # The counter advances exactly once per row: advancing it twice let ids
    # run past ``max_chunks`` and into the range of the next url_prefix.
    for chunk_number, (document, embedding) in enumerate(zip(documents, embeddings), start=1):
        id_ = url_prefix * max_chunks + chunk_number
        ids.append(id_)
        row = {'id': id_, 'vector': embedding, **document}
        milvus_client.insert(collection_name=api_key, data=[row])

    if len(documents) > max_chunks:
        print(f'Warning: Document from {url} had {len(documents)} chunks, but only processed 100_000 ')

    return ids
async def scrape_page_route(
    url: str,
    api_key: str,
    logger: Logger,
    embedding_model: SentenceTransformer,
    milvus_client: MilvusClient,
    image_model,
    url_redis_client: Redis,
    scrape_images: bool = False,
) -> None:
    """Scrape a web page and (re)index it in the ``api_key`` collection.

    Rows previously stored for ``url`` are deleted first, the page is turned
    into documents, the documents are embedded and inserted, and the new row
    ids are recorded in Redis under ``url``.
    """
    url_prefix: int = await hash_url(url)
    await clean_redis(url, url_redis_client, milvus_client, api_key)

    # Keyword arguments on purpose: page_to_docs takes (url, image_model,
    # scrape_image), so passing these two positionally in the other order
    # hands the flag to the model parameter and vice versa.
    documents: list = await page_to_docs(
        url,
        image_model=image_model,
        scrape_image=scrape_images,
    )
    logger.info(f'Added {len(documents)} for {api_key} for {url}')

    ids: list = await add_to_milvus(documents, milvus_client, embedding_model, url_prefix, url, api_key)
    url_redis_client.set(url, json.dumps(ids))
async def scrape_pdf_route(
    url: str,
    api_key: str,
    logger: Logger,
    embedding_model: SentenceTransformer,
    milvus_client: MilvusClient,
    image_model,
    url_redis_client: Redis,
    scrape_images: bool = False,
) -> None:
    """Download a PDF and (re)index it in the ``api_key`` collection.

    Rows previously stored for ``url`` are deleted first, the PDF is turned
    into documents, the documents are embedded and inserted, and the new row
    ids are recorded in Redis under ``url``.
    """
    source_prefix = await hash_url(url)
    await clean_redis(url, url_redis_client, milvus_client, api_key)

    pdf_documents = await pdf_to_docs(url, scrape_images, image_model)
    logger.info(f'Added {len(pdf_documents)} for {api_key} for {url}')

    inserted_ids = await add_to_milvus(
        pdf_documents,
        milvus_client,
        embedding_model,
        source_prefix,
        url,
        api_key,
    )
    url_redis_client.set(url, json.dumps(inserted_ids))
async def scrape_pdf__file_route(
    filename: str,
    api_key: str,
    logger: Logger,
    contents: bytes,
    embedding_model: SentenceTransformer,
    milvus_client: MilvusClient,
    url_redis_client: Redis,
) -> None:
    """Store an uploaded PDF and (re)index it in the ``api_key`` collection.

    Args:
        filename: Name of the uploaded file; also the Redis key under which
            the inserted row ids are tracked.
        api_key: Name of the target collection.
        logger: Logger for the ingestion summary.
        contents: Raw bytes of the PDF.
        embedding_model: Model used to embed the extracted text.
        milvus_client: Client for the vector store.
        url_redis_client: Redis client mapping sources to row ids.
    """
    import os

    # One key for hashing, cleanup and bookkeeping, so a re-upload of the same
    # file finds and removes the rows stored by the previous upload.
    redis_key = filename
    url_prefix: int = await hash_url(redis_key)

    # clean_redis needs the collection to delete from as its fourth argument.
    await clean_redis(redis_key, url_redis_client, milvus_client, api_key)

    # basename() keeps a client-supplied name such as '../x.pdf' inside the
    # upload directory, which is created on first use.
    os.makedirs('assets/pdfs', exist_ok=True)
    pdf_path = f'assets/pdfs/{os.path.basename(filename)}'
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(contents)

    documents: list = await pdf_file_to_docs(pdf_path)
    logger.info(f'Added {len(documents)} for {api_key} for {pdf_path}')

    ids: list = await add_to_milvus(documents, milvus_client, embedding_model, url_prefix, pdf_path, api_key)
    url_redis_client.set(redis_key, json.dumps(ids))
async def ask_route(
    query: str,
    session_id: str,
    api_key: str,
    logger: Logger,
    embedding_model: SentenceTransformer,
    milvus_client: MilvusClient,
    chat_redis_client: Redis,
    groq_client: Groq,
) -> str:
    """Answer ``query`` using the two closest chunks as context.

    The query is embedded and searched in the ``api_key`` collection; the hits
    are appended to the session's chat history as context, the history is sent
    to Groq, and the updated history is saved back under ``session_id``.

    Returns:
        The raw reply string returned by ``run_groq``.
    """
    query_embeddings = embedding_model.encode(query)
    results: list = milvus_client.search(
        collection_name=api_key,
        data=[query_embeddings],
        limit=2,
        output_fields=['text', 'source'],
    )[0]

    # Image documents are stored without a 'source' field (the ingestion code
    # writes the misspelled key 'souce'), so a plain ['source'] lookup can
    # raise KeyError for such hits.
    context = '\n'.join(
        f'''Content : {row['entity']['text']} + {row['entity'].get('source', '')}'''
        for row in results
    )

    with open('assets/database/prompt/rag.md') as rag_prompt_file:
        prompt = rag_prompt_file.read()

    history: list = await load_history(chat_redis_client, session_id)
    if not history:
        # A new session starts with the RAG system prompt.
        history = [{'role': 'system', 'content': prompt}]

    history.append({
        'role': 'user',
        'content': f'\n    Context : {context}\n    Query : {query}\n        ',
    })

    response: str = await run_groq(history, groq_client)
    history.append({'role': 'assistant', 'content': response})

    logger.info(f'Answerd : {response} : for {api_key} for {query}')
    await save_history(chat_redis_client, history, session_id)

    return response

scripts/routers/services.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import hashlib
+from pymilvus import MilvusClient
+from redis import Redis
+import json
+import os
async def hash_url(url: str) -> int:
    """Map ``url`` to an integer taken from its MD5 digest.

    Only the first 8 hex characters (32 bits) of the digest are used.
    """
    digest = hashlib.md5(url.encode())
    return int(digest.hexdigest()[:8], 16)
async def clean_redis(
    url: str,
    url_redis_client: Redis,
    milvus_client: MilvusClient,
    collection_name: str,
) -> None:
    """Delete the Milvus rows previously recorded for ``url``.

    The row ids are read as a JSON list from Redis under ``url``; when nothing
    is stored, the function does nothing.
    """
    stored = url_redis_client.get(url)
    if not stored:
        return

    existing_ids = json.loads(stored)
    print(f'Cleaning Milvus for {url} : {len(existing_ids)}')
    milvus_client.delete(
        collection_name=collection_name,
        filter=f'id in {existing_ids}',
    )

scripts/scrapper/page.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import requests
+from tqdm import tqdm
+from urllib.parse import urljoin
+from scripts.scrapper.services import create_soup
+from scripts.llm.image import caption_image
async def get_images(soup, url: str):
    """Yield the absolute URL of every ``<img>`` in ``soup`` that has a ``src``.

    Relative ``src`` values are resolved against ``url``.
    """
    image_tags = soup.find_all('img')
    progress = tqdm(image_tags, total=len(image_tags), desc='Saving Images', leave=False)
    for tag in progress:
        src = tag.get('src')
        if not src:
            continue
        yield urljoin(url, src)
async def image_to_bytes(img_url: str) -> bytes:
    """Download ``img_url`` and return the response body as bytes.

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    img_response = requests.get(img_url, stream=True)
    img_response.raise_for_status()
    # Join the streamed chunks in one pass. Appending to a name that was never
    # initialised raised UnboundLocalError, and the default chunk size of
    # iter_content() is a single byte.
    return b''.join(img_response.iter_content(chunk_size=8192))
async def page_to_docs(url: str, image_model, scrape_image=False) -> list:
    """Turn a web page into a list of document dicts.

    The page text is split on spaces and grouped into chunks of 512 words; each
    non-empty chunk becomes a document of type ``url``. When ``scrape_image``
    is true, every image on the page is downloaded and captioned with
    ``image_model`` and added as a document of type ``image``.

    Returns:
        Dicts with the keys ``type``, ``text``, ``source`` and ``raw_source``.
    """
    words_per_chunk = 512

    soup = await create_soup(url)
    text_content = soup.get_text(separator='\n', strip=True)

    words = text_content.split(' ')
    # Step over the word list, not over the character count of the text: the
    # latter only produced a long tail of empty chunks to filter out again.
    text_chunks = [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]

    documents = [
        {
            'type': 'url',
            'text': chunk,
            'source': url,
            'raw_source': url,
        }
        for chunk in text_chunks
        if chunk
    ]

    if scrape_image:
        async for img_url in get_images(soup, url):
            image_bytes: bytes = await image_to_bytes(img_url)
            response: str = caption_image(image_bytes, image_model, text_content)
            documents.append({
                'type': 'image',
                'text': response,
                # Spelled 'source' so image hits expose the same field that
                # text hits do when queried.
                'source': img_url,
                'raw_source': image_bytes,
            })

    return documents

scripts/scrapper/pdf.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import requests
+import os
+import fitz
+import PyPDF2
+from tqdm import tqdm
+from scripts.llm.image import caption_image
async def download_pdf(pdf_link: str) -> str:
    """Download ``pdf_link`` into ``assets/pdfs`` and return the local path.

    The file is named after the last path segment of the URL.

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    from urllib.parse import urlsplit

    response = requests.get(pdf_link, stream=True)
    response.raise_for_status()

    # The download directory is not part of the repository; create it on
    # demand so open() below cannot fail with FileNotFoundError.
    os.makedirs('assets/pdfs', exist_ok=True)

    # Use only the URL path so a query string or fragment does not end up in
    # the file name.
    filename = os.path.basename(urlsplit(pdf_link).path)
    filename = f'assets/pdfs/{filename}'

    with open(filename, 'wb') as pdf_file:  # ! Save in folder
        for chunk in response.iter_content(chunk_size=8192):
            pdf_file.write(chunk)

    return filename
async def pdf_to_docs(pdf_link: str, scrape_image: bool, gemini_client):
    """Download a PDF and turn it into a list of document dicts.

    Each page's text is cut into 512-character chunks stored as documents of
    type ``pdf``. When ``scrape_image`` is true, the images embedded in each
    page are captioned with ``gemini_client`` and stored as documents of type
    ``image``.

    Returns:
        Dicts with the keys ``type``, ``text``, ``source`` and ``raw_source``.
    """
    chunk_size = 512
    documents = []

    pdf_name: str = await download_pdf(pdf_link)
    py_pdf_object = PyPDF2.PdfReader(pdf_name)
    fi_pdf_object = fitz.open(pdf_name)

    try:
        num_pages = len(py_pdf_object.pages)
        for page_num in tqdm(range(num_pages), total=num_pages):
            text = py_pdf_object.pages[page_num].extract_text()
            text_chunks = [text[index:index + chunk_size] for index in range(0, len(text), chunk_size)]

            documents.extend([
                {
                    'type': 'pdf',
                    'text': chunk,
                    'source': (pdf_link, page_num),
                    'raw_source': pdf_link,
                }
                for chunk in text_chunks
                if chunk
            ])

            if not scrape_image:
                continue

            # Load the page being processed; indexing with the page count is
            # always one past the last valid page.
            image_page = fi_pdf_object.load_page(page_num)
            images = image_page.get_images(full=True)
            for image_info in tqdm(images, total=len(images)):
                # The first element of each entry is passed to extract_image.
                extracted = fi_pdf_object.extract_image(image_info[0])
                image_bytes: bytes = extracted['image']
                response: str = caption_image(image_bytes, gemini_client, text)
                documents.append({
                    'type': 'image',
                    'text': response,
                    # Spelled 'source' so image hits expose the same field
                    # that text hits do when queried.
                    'source': pdf_name,
                    'raw_source': image_bytes,
                })
    finally:
        # Release the file handle held by the fitz document.
        fi_pdf_object.close()

    return documents
async def pdf_file_to_docs(pdf_name: str):
    """Turn the text of a local PDF into a list of document dicts.

    Each page's text is cut into 512-character chunks; every chunk becomes a
    document of type ``pdf`` whose ``source`` is ``(pdf_name, page_num)``.
    """
    reader = PyPDF2.PdfReader(pdf_name)
    page_count = len(reader.pages)

    collected = []
    for page_num in tqdm(range(page_count), total=page_count):
        page_text = reader.pages[page_num].extract_text()
        for offset in range(0, len(page_text), 512):
            piece = page_text[offset:offset + 512]
            if not piece:
                continue
            collected.append({
                'type': 'pdf',
                'text': piece,
                'source': (pdf_name, page_num),
                'raw_source': pdf_name,
            })

    return collected

scripts/scrapper/services.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from collections import deque
+from typing import Tuple , List , Set
+from scripts.services.services import process_link
async def create_soup(url: str) -> BeautifulSoup:
    """Fetch ``url`` and parse the body with ``html.parser``.

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    page = requests.get(url)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')
async def process_page(url: str):
    """Yield every link on the page at ``url`` that ``process_link`` accepts."""
    soup: BeautifulSoup = await create_soup(url)
    for anchor in soup.find_all('a', href=True):
        accepted = await process_link(anchor['href'])
        if accepted:
            yield accepted
async def get_pdf_links(base_html: str):
    """Crawl the site below ``base_html`` breadth-first and collect its links.

    Only pages whose URL starts with ``base_html`` are followed. Links ending
    in ``pdf`` are collected but not followed.

    Returns:
        A tuple ``(pdf_links, all_links)``: the list of distinct PDF URLs in
        discovery order, and the set of page URLs queued for crawling
        (including ``base_html`` itself).
    """
    pdf_links: List[str] = []
    seen_pdfs: Set[str] = set()
    all_links: Set[str] = {base_html}
    visited_urls: set = {base_html}
    url_queue: deque = deque([base_html])

    while url_queue:
        current_url: str = url_queue.popleft()
        print(current_url, len(url_queue))
        try:
            async for link in process_page(current_url):
                if link.endswith('pdf'):
                    # Resolve against the page the link was found on; gluing a
                    # relative link onto the crawl root breaks for sub-pages
                    # and root-relative hrefs. Absolute links pass unchanged.
                    pdf_url: str = urljoin(current_url, link)
                    if pdf_url not in seen_pdfs:
                        seen_pdfs.add(pdf_url)
                        pdf_links.append(pdf_url)
                    continue

                absolute_url: str = urljoin(current_url, link)
                if absolute_url.startswith(base_html) and absolute_url not in visited_urls:
                    visited_urls.add(absolute_url)
                    all_links.add(absolute_url)
                    url_queue.append(absolute_url)
        except Exception as error:
            # Best effort: one unreachable or malformed page must not abort the
            # crawl, but it is reported instead of being silently dropped.
            print(f'Skipping {current_url} : {error}')

    return pdf_links, all_links

scripts/services/services.py ADDED Viewed

	@@ -0,0 +1,20 @@

async def process_link(href):
    """Return ``href`` when it is a followable link, otherwise ``None``.

    A link is rejected when it is empty, not a string, a ``mailto:`` address,
    or ends with one of the image/video suffixes below.
    """
    # ! Add conditions as needed
    skipped_suffixes = (
        'png', 'jpg', 'jpeg', 'gif', 'bmp',
        'tiff', 'svg', 'webp', 'webm', 'mp4',
    )
    if not href or not isinstance(href, str):
        return None
    if href.startswith('mailto:') or href.endswith(skipped_suffixes):
        return None
    return href

test.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import requests
+import json
+from tqdm import tqdm
+from requests import Response
def scrape_url_and_save_in_json_file(url: str, json_file_path: str) -> None:
    """POST ``url`` to the local ``/scrape-url`` endpoint and save the reply.

    The decoded JSON response is written to ``json_file_path``.
    """
    crawl_reply: Response = requests.post(
        'http://localhost:8002/scrape-url',
        json={'url': url},
    )
    with open(json_file_path, 'w') as output_file:
        json.dump(crawl_reply.json(), output_file)
def scrape_pdfs(json_file_path: str, api_key: str) -> None:
    """Send every entry of ``pdf_links`` in the JSON file to ``/scrape-pdf``.

    Image scraping is disabled for each request.
    """
    with open(json_file_path) as links_file:
        crawl_result = json.load(links_file)

    pdf_urls = crawl_result['pdf_links']
    for pdf_url in tqdm(pdf_urls, total=len(pdf_urls)):
        payload = {
            'url': pdf_url,
            'api_key': api_key,
            'scrape-images': False,
        }
        requests.post('http://localhost:8002/scrape-pdf', json=payload)
def scrape_page(json_file_path: str, api_key: str) -> None:
    """Send every entry of ``all_links`` in the JSON file to ``/scrape-page``.

    Image scraping is disabled for each request.
    """
    with open(json_file_path) as links_file:
        crawl_result = json.load(links_file)

    page_urls = crawl_result['all_links']
    for page_url in tqdm(page_urls, total=len(page_urls)):
        payload = {
            'url': page_url,
            'api_key': api_key,
            'scrape-images': False,
        }
        requests.post('http://localhost:8002/scrape-page', json=payload)
def ask(query, session_id, api_key) -> dict:
    """POST a question to the local ``/ask`` endpoint.

    The decoded JSON reply is pretty-printed and returned.
    """
    payload = {
        'query': query,
        'session_id': session_id,
        'api_key': api_key,
    }
    reply: Response = requests.post('http://localhost:8002/ask', json=payload)
    print(json.dumps(reply.json(), indent=4))
    return reply.json()
# Manual smoke run against a server on localhost:8002. Step 1 (crawling and
# writing temp.json) is kept commented out; the calls below expect temp.json
# to exist already.
# NOTE(review): the key below is a credential-looking literal committed to the
# repository - consider reading it from the environment instead.
# scrape_url_and_save_in_json_file('https://www.thetravellerdmc.com/' , 'temp.json')
_API_KEY = 'vk_LANm5E0tSA3bWdN46wW4DVOlJWAMbNoa874BiNSXz7dFIddNzp5I04BNoG2mlI4N'
scrape_pdfs('temp.json', _API_KEY)
scrape_page('temp.json', _API_KEY)

test.sh ADDED Viewed

	@@ -0,0 +1,47 @@

# Smoke-test requests for the backend API.
# Override the target deployment and the collection key from the environment:
#   BASE_URL=http://localhost:8002 API_KEY=my-key bash test.sh
BASE_URL="${BASE_URL:-https://8888-01jvz3v9phphmvq0twsmakz8zy.cloudspaces.litng.ai}"
# The scrape-page, scrape-pdf and ask handlers answer 400 without "api_key".
API_KEY="${API_KEY:-}"

# Root Endpoint (/) - Remains GET as it doesn't take a body
curl "${BASE_URL}/"

# Scrape URL Endpoint (/scrape-url)
# (no trailing comma after the last member - it makes the body invalid JSON)
curl -X POST "${BASE_URL}/scrape-url" \
-H "Content-Type: application/json" \
-d '{
        "url": "https://www.nrsc.gov.in/Knowledge_EBooks/"
    }'

# Scrape Page Endpoint (/scrape-page)
curl -X POST "${BASE_URL}/scrape-page" \
-H "Content-Type: application/json" \
-d '{
        "url": "https://voicexp.ai/",
        "scrape-images": true,
        "api_key": "'"${API_KEY}"'"
    }'

# Scrape PDF Endpoint (/scrape-pdf)
# (the handler reads the key "scrape-images", plural)
curl -X POST "${BASE_URL}/scrape-pdf" \
-H "Content-Type: application/json" \
-d '{
        "url": "http://example.com/path/to/your/document.pdf",
        "scrape-images": false,
        "api_key": "'"${API_KEY}"'"
    }'

curl -X POST "${BASE_URL}/scrape-pdf-file" \
    -H "Content-Type: multipart/form-data" \
    -F "file=@/path/to/your/document.pdf"

# Ask Endpoint (/ask)
# (uses the same deployment as the other calls; the previous host name began
# with "-", which is not a valid host name)
curl -X POST "${BASE_URL}/ask" \
-H "Content-Type: application/json" \
-d '{
        "query": "What is the main topic of the ingested documents?",
        "session_id": "user123_chat789",
        "api_key": "'"${API_KEY}"'"
    }'

curl -X GET "${BASE_URL}/number-of-queries"
curl -X GET "${BASE_URL}/sentiment"
curl -X GET "${BASE_URL}/token-count"
curl -X GET "${BASE_URL}/category"