ashishbangwal committed on
Commit
206ef5f
·
1 Parent(s): 9b50757

latest updates to investor agent [UI + backend-logic change]

Browse files
main.py CHANGED
@@ -1,910 +1,84 @@
1
- import os
2
- import uuid
3
- import psycopg2
4
- import time
5
- import re
6
- import asyncio
7
- import cohere
8
- import numpy
9
- import streamlit as st
10
- import pdfkit
11
- import json
12
- import requests
13
- import tempfile
14
- import mistune
15
- import markdown as md
16
- import psycopg2
17
- import html2text
18
- from typing import List, Tuple, Dict
19
- from pinecone import Pinecone, ServerlessSpec
20
- import openai
21
- import os
22
- import pymupdf
23
- import tiktoken
24
- import google.generativeai as gemini
25
- from PIL import Image
26
- from PIL import PngImagePlugin # important to avoid google_genai AttributeError
27
- import json
28
- import hashlib
29
- from dotenv import load_dotenv
30
- from classifier import Classifier
31
- from tenacity import retry, stop_after_attempt, wait_random_exponential
32
- from fastapi import FastAPI, HTTPException, UploadFile, File, Form
33
- from fastapi.middleware.cors import CORSMiddleware
34
- from fastapi_cache import FastAPICache
35
- from fastapi_cache.backends.inmemory import InMemoryBackend
36
- from fastapi_cache.decorator import cache
37
- import aiohttp
38
-
39
# FastAPI application instance; all routes in this module attach to it.
app = FastAPI()


# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favour of lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
async def startup():
    # Initialise the in-process cache backend used by fastapi-cache decorators.
    FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")
45
-
46
-
47
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()

# API credentials read from the environment; any of these is None when the
# corresponding variable is unset.
TOGETHER_API_KEY = os.getenv("TOGETHER_API")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
COHERE_API = os.getenv("COHERE_API")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Configure the Gemini SDK (used for image-to-text in extract_image_content).
gemini.configure(api_key=GEMINI_API_KEY)

# OpenAI-compatible client pointed at Together AI; used for embeddings below.
client = openai.OpenAI(base_url="https://api.together.xyz/v1", api_key=TOGETHER_API_KEY)
58
-
59
# --- LLM system prompts ----------------------------------------------------
# These strings are sent verbatim to the models; their wording (including
# existing typos) is deliberately left untouched, since any change alters
# model input and therefore behaviour.

# Fallback system prompt used when a caller supplies none.
SysPromptDefault = "You are now in the role of an expert AI."
# RAG answer-generation prompt (the "vectorstore" route in answer()).
GenerationPrompt = """You are an expert AI whose task is to ANSWER the user's QUESTION using the provided CONTEXT.
Forget everything you know, Fully rely on CONTEXT to provide the answer.
Follow these steps:
1. Think deeply and multiple times about the user's QUESTION. You must understand the intent of their question and provide the most appropriate answer.
2. Choose the most relevant content from the CONTEXT that addresses the user's question and use it to generate an answer.
Formating Instructions:
Respond only in markdown format; don't use big headings"""
# Router prompt: decides whether the CONTEXT can answer the QUESTION
# ("vectorstore") or not ("missing_information"); must return one-key JSON.
QuestionRouter = """ You are an expert investor, You must identify if the provided CONTEXT can answer the user QUESTION.
1 vectorstore : The provided CONTEXT is sufficient to answer the question.
2 missing_information : The provided CONTEXT does not contains the answer.
output options: 'vectorstore' OR 'missing_information'.The output must be a valid JSON.Do not add any additional comments.
Output format:
{
"datasource":"identified option"
}
Return the a valid JSON with a single key 'datasource' and no preamble or explanation. Question to identify: QUESTION """
# Prompt used when the router decides the context lacks the answer.
MissingInformation = """You are an expert in identifying missing information in a given CONTEXT to answer a QUESTION effectively. Your task is to analyze the CONTEXT, and pinpoint the missing content needed for each QUESTION. Take your time to process the information thoroughly and provide a list output without any additional comments. The output format should be valid markdown list , without any additional comments:
"""
# Page-by-page PDF summarisation prompt (keeps numeric data verbatim).
SummaryPrompt = """You are an expert AI specializing in document summarization. You have been refining your skills in text summarization and data extraction for over a decade, ensuring the highest accuracy and clarity. Your task is to read raw data from a PDF file page by page and provide a detailed summary of the CONTEXT while ensuring all numerical data is included in the summary without alteration. The output should be in Markdown format, with appropriate headers and lists to enhance readability. Follow these instructions:
1.Summarize the Text: Provide a concise summary of the CONTEXT, capturing the main points and essential information.
2.Retain Numerical Data: Ensure all numerical data (e.g., dates, statistics, financial figures, percentages, measurements) is included in the summary.
3.Markdown Format: Format the output in Markdown, using headers, lists, and other Markdown elements appropriately.
Note: Whenever the CONTEXT is about a TEAM, DO NOT summarize; instead, output the content in a neat markdown format with Names and previous designation of the TEAM.
"""
# Industry/niche classification prompt; expects a three-key JSON response
# ("industry", "niche", "reasoning") — consumed by industry().
IndustryPrompt = """You are a business strategy consultant. You have been identifying niche markets and industries for companies across various sectors for over 20 years. Your expertise lies in analyzing detailed CONTEXT to accurately pinpoint the niche and industry of a business.
Objective: Identify the niche and industry of a business by analyzing the provided CONTEXT.
Steps to follow:
Read the context: Carefully read the provided information to understand the business's products, services, target audience, and unique value propositions.
Determine the industry: Based on the provided CONTEXT, identify the primary industry to which the business belongs. Consider factors such as the type of products/services offered, the market served, and industry-specific terminology.
Identify the niche: Analyze the details to pinpoint the specific niche within the industry. Look for unique aspects of the business, specialized market segments, or specific customer needs that the business addresses.
Provide output in JSON format: Clearly state the identified industry and niche in a JSON format. Ensure your reasoning supports the identified industry and niche.The output should JSON ,Do not add any additional format.
Output format:
{
"industry": "Identified industry here",
"niche": "Identified niche here",
"reasoning": "Explanation of how the industry and niche were identified based on the context"
}
Take a deep breath and work on this problem step-by-step.
"""

# Scoring rubric for investment().  NOTE: the schema below declares
# "overall_score" as a string — the consumer must coerce it to a number.
Investment = """You are a professional financial analyst evaluating sectors for investment potential. Your task is to:

1. Identify the sector from the provided CONTEXT.
2. Grade only the specified KEYS on a scale of 1-10, with higher grades indicating better investment potential. Take a conservative approach in grading.
3. Provide reasoning for each grade considering both qualitative and quantitative factors, including the FUNDING information.
4. Assign weights to each section (total should equal 1).
5. Calculate an overall weighted score.

Use only the information given in the CONTEXT and the FUNDING provided. Be conservative in your grading to reflect investment risks. Output your analysis in the following JSON format:

```json
{
"sector": "Sector name",
"sections": [
{
"section": "Key from Context",
"score": "Grade (1-10)",
"weight": "Weight (0-1)",
"reasoning": "Detailed analysis including funding considerations"
}
],
"overall_score": "Calculated weighted score"
}
```

Grade only these KEYS from the CONTEXT:
1. "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?"
2. "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?"
3. "How does the company's product/service differentiate itself from competitors in the market?"
4. "What issue or challenge is the company addressing?"
5. "Risks Involved"
6. "Barrier To Entry"
7. "Competitors"
8. "Challenges"

Additionally, consider the FUNDING provided by the user:
- The FUNDING will be classified as follows:
* Low: Less than $1 million
* Medium: $1 million to $10 million
* High: More than $10 million
- Adjust your scoring based on the funding classification:
* For Low funding: Reduce scores by 1-2 points where relevant.
* For Medium funding: Keep scores as they would be without considering funding.
* For High funding: Increase scores by 1-2 points where relevant.
- Incorporate the funding information into your analysis of each relevant key.
- Consider how the funding level impacts various aspects such as projected financials, ability to execute plans, competitive positioning, and risk mitigation.
- Reflect the impact of funding in your scoring and reasoning for each relevant key.
-Don't explicitly mention the original funding in the answer but use them to give reasoning.


Provide your analysis based on the CONTEXT and FUNDING that will be given.
"""
152
-
153
# The standard due-diligence questions asked of every pitch deck; the last
# four entries double as the graded KEYS consumed by investment().
queries = [
    "What is the company's product/service, and what are its key features?",
    "Who is the target customer for the company's product/service, and what problem does it solve for them?",
    "What are the company's revenue streams?",
    # BUGFIX: a missing trailing comma previously merged this question with
    # the next one via implicit string concatenation, silently dropping one
    # query from the list (13 items instead of 14).
    "How does the company price its products/services?",
    "What are the key cost drivers and profit margins for the company?",
    "What opportunities for growth and expansion does the company foresee?",
    "Who is the target market for the company's product/service, and how does the company plan to reach them?",
    "What sales channels and distribution partnerships does the company have in place?",
    "How is the company's marketing budget allocated?",
    "What is the company's historical financial performance, including growth rate?",
    "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?",
    "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?",
    "How does the company's product/service differentiate itself from competitors in the market?",
    "What issue or challenge is the company addressing?",
]
169
-
170
# Gate serialising PDF ingestion: cleared while a new document is being
# embedded so concurrent queries wait for indexing to finish (see get_docs()).
# Starts "set" (open) so the first caller proceeds immediately.
document_processing_event = asyncio.Event()
document_processing_event.set()
172
-
173
-
174
def get_digest(pdf_content):
    """Return the SHA-256 hex digest of the raw PDF bytes (used as file_id)."""
    return hashlib.sha256(pdf_content).hexdigest()
178
-
179
-
180
def response(
    message: str,
    model: str = "meta-llama/llama-3-70b-instruct:nitro",
    SysPrompt: str = SysPromptDefault,
    temperature: float = 0.2,
):
    """
    Send a single system+user chat-completion request through OpenRouter.

    Retries with exponential backoff (up to 6 attempts) on transient
    failures.  Returns the completion text, or None if every attempt failed.

    :param message: user-turn content.
    :param model: OpenRouter model identifier.
    :param SysPrompt: system-turn content.
    :param temperature: sampling temperature.
    """
    # A fresh client per call keeps this function self-contained; the
    # module-level `client` points at Together AI, not OpenRouter.
    client = openai.OpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url="https://openrouter.ai/api/v1",
    )

    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]

    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        print("RETRY")
        return client.chat.completions.create(**kwargs)

    try:
        completion = completion_with_backoff(
            model=model,
            messages=messages,
            temperature=temperature,
            frequency_penalty=0.2,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # BUGFIX: the failure path previously fell off the end of the
        # function, returning None implicitly; make that contract explicit.
        print(f"An error occurred: {e}")
        return None
214
-
215
-
216
def number_of_tokens(texts: List[str]) -> List[int]:
    """Return the gpt-3.5-turbo token count for each string in *texts*."""
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return [len(encoded) for encoded in encoder.encode_batch(texts)]
224
-
225
-
226
def limit_tokens(input_string, token_limit=5500):
    """Truncate *input_string* to at most *token_limit* gpt-3.5-turbo tokens."""
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    truncated = encoding.encode(input_string)[:token_limit]
    return encoding.decode(truncated)
234
-
235
-
236
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """
    Describe chart-like images from a pitch deck as text.

    Each pixmap is converted to a PIL image, a local ONNX classifier decides
    which images are graphs/charts, and Gemini extracts factual content from
    those.  Returns one description string per graph-classified image.
    """
    # ONNX classifier session + Gemini vision model, created per call.
    classifier = Classifier("graph_classifierV2_B.onnx")
    model = gemini.GenerativeModel("gemini-1.5-flash")

    description_prompt = f"You are provided with the images extracted from a pitch-deck and some text surrounding the image from the same pitch deck. Extract all the factual information that the image is trying to communicate through line charts, area line charts, bar charts, pie charts, tables exectra. Use OCR to extract numerical figures and include them in the information. If the image does not have any information like its a blank image or image of a person then response should be NOTHING. Do not add any additional comments or markdown, just give information. \n\n SURROUNDING TEXT \n\n{text}"

    pil_images = []
    for pixmap in pixmap_list:
        try:
            converted = Image.frombytes(
                mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
            )
            pil_images.append(converted)
        except Exception as e:
            print(e)

    # One boolean per image: True when the classifier thinks it is a chart.
    graph_flags = classifier.classify(pil_images)
    print(graph_flags)

    descriptions = []
    for image, is_graph in zip(pil_images, graph_flags):
        if not is_graph:
            continue
        reply = model.generate_content([description_prompt, image], stream=False)
        print("\n\n", reply.text, "\n\n")
        descriptions.append(str(reply.text))

    return descriptions
272
-
273
-
274
def extract_content(pdf_content: bytes) -> List[Tuple[str, int]]:
    """
    Extract text + image descriptions from a PDF.

    Returns a list of (content, page_number) tuples, 1-based page numbers.
    Pages whose content exceeds ~500 tokens are split into several roughly
    equal character slices so each chunk fits the embedding model's input.
    """
    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")

    pages_content = []
    refered_xref = []  # image xrefs already described, so shared images run once
    for page_number in range(pdf_doc.page_count):
        page_content = ""

        # Textual content; newlines flattened to tabs to keep one line per page.
        page = pdf_doc.load_page(page_number)
        text_content = str(page.get_text()).replace("\n", "\t")
        page_content += text_content

        # Image content: collect new pixmaps, then describe them via Gemini.
        image_list = page.get_image_info(xrefs=True)
        pixmap_list = []
        for img_info in image_list:
            xref = img_info["xref"]
            if xref not in refered_xref:
                try:
                    img_pixmap = pymupdf.Pixmap(pdf_doc, xref)
                    pixmap_list.append(img_pixmap)
                    refered_xref.append(xref)
                except ValueError as e:
                    print(f"Skipping image with due to error: {e}")
        if len(pixmap_list) > 0:
            img_content = extract_image_content(
                pixmap_list=pixmap_list, text=text_content.replace("\n", "\t")
            )
            page_content = page_content + "\n\n" + "\n\n".join(img_content)

        pages_content.append(page_content)

    num_tokens = number_of_tokens(pages_content)

    final_data = []

    # Split any page whose content exceeds ~512 tokens into n roughly equal parts.
    for e, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = numpy.ceil(n_token / 500).astype(int)
            len_content = len(pages_content[e])
            part_size = len_content // n_parts
            start = 0
            temp = []
            for part in range(n_parts):
                # BUGFIX: the final slice now runs to the end of the page, so
                # the trailing remainder (up to n_parts-1 characters that
                # integer division dropped) is no longer lost.
                end = len_content if part == n_parts - 1 else start + part_size
                temp.append((pages_content[e][start:end], e + 1))
                start = end
            final_data += temp
        else:
            final_data.append((pages_content[e], e + 1))

    pdf_doc.close()
    print(final_data)
    return final_data
336
-
337
-
338
def markdown(output):
    """
    Convert a web_search() result ({"report": html, "references": {url: html}})
    into a single Markdown string with collapsible reference sections.
    """
    report_html = output.get("report", "")
    references = output.get("references", {})
    references_markdown = ""

    for url, content in references.items():
        # Making the URL clickable in pure HTML
        clickable_url = f'<a href="{url}">{url}</a>'
        references_markdown += f"<details><summary>{clickable_url}</summary>\n\n{html2text.html2text(content)}</details>\n\n"

    combined_markdown = ""
    if report_html.strip():  # Check if report_html is not empty
        # Use html2text to convert HTML to Markdown, ensuring it doesn't break lines unnecessarily
        report_markdown = html2text.html2text(report_html)
        # Remove unwanted newlines within Markdown headings
        # NOTE(review): as written this turns EVERY space into a newline
        # (first replace makes "\n"->" ", second " "->"\n"); presumably the
        # second argument was meant to be a double space — confirm against
        # the intended rendering before relying on this output.
        report_markdown = report_markdown.replace("\n", " ").replace(" ", "\n")
        combined_markdown += report_markdown + "\n\n"
    combined_markdown += references_markdown
    return combined_markdown
357
-
358
-
359
def pinecone_server():
    """
    Return a handle to the "investment" Pinecone index, creating it on first use.

    Dimension 1024 matches the BAAI/bge-large-en-v1.5 embeddings produced by
    embed()/embed_and_upsert().
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index_name = "investment"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            index_name,
            dimension=1024,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    # Brief pause so a freshly created index is ready before first use.
    time.sleep(1)
    index = pc.Index(index_name)
    index.describe_index_stats()
    return index
373
-
374
-
375
def fetch_vectorstore_from_db(file_id):
    """
    Look up the Pinecone namespace previously recorded for *file_id*.

    Creates the tracking table on first use.  Returns the namespace string,
    or None when the file has never been ingested.
    """
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            create_table_query = """
            CREATE TABLE IF NOT EXISTS investment_research_pro (
                file_id VARCHAR(1024) PRIMARY KEY,
                file_name VARCHAR(1024),
                name_space VARCHAR(1024)
            );
            """
            cur.execute(create_table_query)
            conn.commit()
            fetch_query = """
            SELECT name_space
            FROM investment_research_pro
            WHERE file_id = %s;
            """
            cur.execute(fetch_query, (file_id,))
            result = cur.fetchone()
    finally:
        # BUGFIX: the connection now closes even when a query raises;
        # previously an exception leaked both cursor and connection.
        conn.close()
    if result:
        return result[0]
    return None
406
-
407
-
408
def get_next_namespace():
    """
    Generate the next Pinecone namespace name as "pdf-<row count + 1>".

    NOTE(review): deriving the name from the row count can collide if rows
    are ever deleted or two documents are ingested concurrently — confirm
    this is acceptable, or switch to a sequence/UUID.
    """
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM investment_research_pro")
            count = cur.fetchone()[0]
    finally:
        # BUGFIX: the connection now closes even when the query raises.
        conn.close()
    return f"pdf-{count + 1}"
423
-
424
-
425
def insert_data(file_id, file_name, name_space):
    """
    Record the (file_id, file_name, name_space) mapping for an ingested PDF.

    Creates the tracking table on first use.  Duplicate file_ids are ignored
    (ON CONFLICT DO NOTHING), so re-inserting a known file is a no-op.
    """
    print("inserted")
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            create_table_query = """
            CREATE TABLE IF NOT EXISTS investment_research_pro (
                file_id VARCHAR(1024) PRIMARY KEY,
                file_name VARCHAR(1024),
                name_space VARCHAR(255)
            );
            """
            cur.execute(create_table_query)
            conn.commit()
            insert_query = """
            INSERT INTO investment_research_pro (file_id, file_name, name_space)
            VALUES (%s, %s, %s)
            ON CONFLICT (file_id) DO NOTHING;
            """
            cur.execute(insert_query, (file_id, file_name, name_space))
            conn.commit()
    finally:
        # BUGFIX: the connection now closes even when a query raises.
        conn.close()
454
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
def create_documents(page_contents):
    """
    Wrap (content, page_number) tuples into document dicts for upserting.

    Each dict carries the text under "page_content" plus a metadata mapping
    with the page number and the original text (stored so retrieval can
    return the untruncated content).
    """
    return [
        {
            "page_content": content,
            "metadata": {"page_number": page_number, "original_content": content},
        }
        for content, page_number in page_contents
    ]
467
-
468
-
469
def embed_and_upsert(documents, name_space):
    """
    Embed each document's text with BAAI/bge-large-en-v1.5 (via the Together
    client) and upsert the vectors + metadata into the Pinecone namespace.
    """
    texts = [doc["page_content"] for doc in documents]
    pinecone_index = pinecone_server()
    embedding_items = client.embeddings.create(
        input=texts, model="BAAI/bge-large-en-v1.5"
    ).data
    # Random UUIDs as vector ids; metadata carries page number + original text.
    vectors = [
        {"id": str(uuid.uuid4()), "values": item.embedding, "metadata": doc["metadata"]}
        for doc, item in zip(documents, embedding_items)
    ]
    pinecone_index.upsert(vectors=vectors, namespace=name_space)
484
-
485
-
486
def embedding_creation(pdf_content, name_space):
    """End-to-end ingestion: extract PDF content, wrap as documents, embed + upsert."""
    documents = create_documents(extract_content(pdf_content))
    embed_and_upsert(documents, name_space)
    print("Embeddings created and upserted successfully into Pinecone.")
492
-
493
-
494
def embed(question):
    """Return the embedding vector for one query string (BAAI/bge-large-en-v1.5)."""
    items = client.embeddings.create(
        input=[question],
        model="BAAI/bge-large-en-v1.5",
    ).data
    return items[0].embedding
501
-
502
-
503
def process_rerank_response(rerank_response, docs):
    """
    Map Cohere rerank result indices back onto the original document list.

    Out-of-range indices are reported and skipped; order follows the rerank
    results (best first).
    """
    selected = []
    for result in rerank_response.results:
        index = result.index
        if not (0 <= index < len(docs)):
            print(f"Warning: Index {index} is out of range for documents list.")
            continue
        selected.append(docs[index])
    return selected
512
-
513
-
514
async def get_docs(question, pdf_content, file_name):
    """
    Retrieve the top reranked document chunks for *question* from Pinecone.

    If the PDF has not been ingested before, it is embedded into a fresh
    namespace first; `document_processing_event` gates concurrent callers
    while that happens.  Returns up to 3 reranked chunk strings (may be
    empty when nothing matches).
    """
    global document_processing_event
    index = pinecone_server()
    co = cohere.Client(COHERE_API)
    xq = embed(question)

    # Wait for any in-flight ingestion to finish before looking up the file.
    await document_processing_event.wait()
    file_id = get_digest(pdf_content)
    existing_namespace = fetch_vectorstore_from_db(file_id)

    if existing_namespace:
        print("Document already exists. Using existing namespace.")
        name_space = existing_namespace
    else:
        # Block concurrent queries while the new document is indexed.
        document_processing_event.clear()
        print("Document is new. Creating embeddings and new namespace.")
        try:
            name_space = get_next_namespace()
            print(name_space)
            embedding_creation(pdf_content, name_space)
            insert_data(file_id, file_name, name_space)
        finally:
            # BUGFIX: always release the gate, even if ingestion fails —
            # previously an exception here left every waiter blocked forever
            # (the original try/finally was commented out).
            document_processing_event.set()

    # Query happens after any new-document processing has completed.
    res = index.query(namespace=name_space, vector=xq, top_k=5, include_metadata=True)

    print(res)
    docs = [x["metadata"]["original_content"] for x in res["matches"]]

    if not docs:
        print("No matching documents found.")
        return []

    results = co.rerank(
        query=question, documents=docs, top_n=3, model="rerank-english-v3.0"
    )
    return process_rerank_response(results, docs)
557
-
558
-
559
async def industry(pdf_content, file_name):
    """
    Identify the business's industry and niche from the pitch deck.

    Retrieves relevant chunks, asks the LLM with IndustryPrompt, and returns
    the parsed JSON dict ("industry", "niche", "reasoning").
    """
    question = (
        "What is the name and its specific niche business this document pertains to."
    )
    docs = await get_docs(question, pdf_content, file_name)
    context = "\n\n".join(docs)
    message = f"CONTEXT\n\n{context}\n\n"
    raw = response(
        message=message,
        model="meta-llama/llama-3-70b-instruct:nitro",
        SysPrompt=IndustryPrompt,
        temperature=0,
    )
    parsed = json.loads(raw)
    print(parsed)
    return parsed
573
-
574
-
575
def split_into_chunks(input_string, token_limit=4500):
    """
    Split *input_string* into chunks of at most *token_limit* tokens,
    preferring to break at a whitespace token so words are not cut mid-way.
    Returns the list of decoded chunk strings.
    """
    # Initialize the tokenizer for the model
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # Encode the input string to get the tokens
    tokens = encoding.encode(input_string)

    # List to store chunks
    chunks = []
    start = 0

    # Iterate over the tokens and split into chunks
    while start < len(tokens):
        end = start + token_limit
        if end >= len(tokens):
            # Remaining tokens fit in one final chunk.
            chunk_tokens = tokens[start:]
        else:
            # Walk backwards from the limit looking for a whitespace token
            # to break on.
            break_point = end
            while break_point > start and tokens[break_point] not in encoding.encode(
                " "
            ):
                break_point -= 1

            if break_point == start:
                # No whitespace found: hard-split at the token limit.
                chunk_tokens = tokens[start:end]
            else:
                chunk_tokens = tokens[start:break_point]
                end = break_point

        chunk = encoding.decode(chunk_tokens)
        chunks.append(chunk)
        start = end

    return chunks
609
-
610
-
611
def further_split_chunk(chunk, token_limit):
    """
    Split *chunk* into sub-chunks of at most *token_limit* tokens.

    CONSISTENCY: the original body duplicated split_into_chunks() verbatim
    (same tokenizer, same whitespace break-point search); delegate to it so
    the splitting logic lives in exactly one place.
    """
    return split_into_chunks(chunk, token_limit=token_limit)
639
-
640
-
641
# Define the investment function
def investment(queries, query_results, other_info_results, Funding):
    """
    Grade the startup's investment potential with an LLM.

    Builds a CONTEXT from the last four query answers plus the web-research
    results, sends it (chunked to fit the model window) with the Investment
    rubric prompt, and aggregates the per-chunk JSON analyses into a single
    JSON string: {"sectors": [...], "final_score": <mean overall_score>}.

    :param queries: list of due-diligence questions (last 4 are graded keys).
    :param query_results: answers aligned with *queries*.
    :param other_info_results: {section: markdown/text} web-research results.
    :param Funding: funding amount/classification passed to the prompt.
    """
    # Combine queries and query_results into a dictionary
    combined_results = {q: r for q, r in zip(queries[-4:], query_results[-4:])}

    # Merge in the web-research sections, stripping the collapsible
    # reference blocks appended by markdown().
    for key, value in other_info_results.items():
        if isinstance(value, str):
            combined_results[key] = value.split("<details><summary>")[0].strip()
        else:
            combined_results[key] = value
    print(combined_results)
    message = f"CONTEXT:\n\n{json.dumps(combined_results, indent=4)}\n\nFUNDING:\n\n{Funding}\n\n"

    sys_prompt = Investment
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    sys_prompt_token_size = len(encoding.encode(sys_prompt))

    max_model_tokens = 7000
    max_chunk_size = 7000  # Adjust to leave more buffer space

    chunks = split_into_chunks(message, token_limit=max_chunk_size)

    model = "anthropic/claude-3.5-sonnet"
    responses = []
    tokens_used = 0
    max_tokens_per_minute = 7000

    for chunk in chunks:
        chunk_token_size = len(encoding.encode(chunk))
        combined_message = f"{sys_prompt}\n{chunk}"
        combined_token_size = len(encoding.encode(combined_message))

        print(
            f"Token size of the combined message and SysPrompt for this chunk: {combined_token_size}"
        )
        print(f"Chunk token size: {chunk_token_size}")
        print(f"SysPrompt token size: {sys_prompt_token_size}")

        if combined_token_size > max_model_tokens:
            print(
                f"Warning: Combined token size ({combined_token_size}) exceeds the model's limit ({max_model_tokens}). Adjusting chunk size."
            )
            sub_chunks = further_split_chunk(
                chunk, max_model_tokens - sys_prompt_token_size
            )
            for sub_chunk in sub_chunks:
                sub_chunk_token_size = len(encoding.encode(sub_chunk))
                print(sub_chunk_token_size)
                # Tiny fragments carry no gradeable content; skip them.
                if sub_chunk_token_size > 500:
                    sub_combined_message = f"{sys_prompt}\n{sub_chunk}"
                    sub_combined_token_size = len(encoding.encode(sub_combined_message))
                    if sub_combined_token_size <= max_model_tokens:
                        response_str = response(
                            message=sub_chunk,
                            model=model,
                            SysPrompt=sys_prompt,
                            temperature=0,
                        )
                        print(response_str)
                        json_part = extract_json(response_str)
                        if json_part:
                            responses.append(json_part)
                        else:
                            print("Warning: No valid JSON part found in the response.")
                        # Crude rate limiting on estimated tokens per minute.
                        tokens_used += sub_combined_token_size
                        if tokens_used >= max_tokens_per_minute:
                            print("Waiting for 60 seconds to avoid rate limit.")
                            time.sleep(60)
                            tokens_used = 0
        else:
            if chunk_token_size >= 500:
                response_str = response(
                    message=chunk, model=model, SysPrompt=sys_prompt, temperature=0
                )
                print(response_str)
                json_part = extract_json(response_str)
                if json_part:
                    responses.append(json_part)
                else:
                    print("Warning: No valid JSON part found in the response.")
                tokens_used += combined_token_size
                if tokens_used >= max_tokens_per_minute:
                    print("Waiting for 60 seconds to avoid rate limit.")
                    time.sleep(60)
                    tokens_used = 0

    combined_json = {"sectors": [], "final_score": 0}
    total_score = 0
    count = 0

    for response_str in responses:
        response_json = json.loads(response_str)
        combined_json["sectors"].append(response_json)
        # BUGFIX: the rubric prompt's schema declares "overall_score" as a
        # string, so the model may return e.g. "7.5" — the original
        # `total_score += response_json["overall_score"]` then raised
        # TypeError.  Coerce to float and skip non-numeric/missing scores.
        try:
            total_score += float(response_json["overall_score"])
            count += 1
        except (KeyError, TypeError, ValueError):
            print("Warning: response missing a numeric 'overall_score'; skipped.")

    if count > 0:
        combined_json["final_score"] = total_score / count
    final_json = json.dumps(combined_json, indent=4)
    print(final_json)
    return final_json
744
-
745
-
746
def extract_json(response_str):
    """
    Pull the first '{' through last '}' span out of an LLM reply and return
    it only if it parses as JSON; otherwise return None.  The greedy DOTALL
    match deliberately spans newlines inside code fences.
    """
    match = re.search(r"\{.*}", response_str, re.DOTALL)
    if not match:
        return None
    candidate = match.group()
    try:
        json.loads(candidate)  # validate before handing it back
    except json.JSONDecodeError:
        print("Invalid JSON detected.")
        return None
    return candidate
757
-
758
-
759
async def answer(client, question, pdf_content, file_name):
    """
    Route *question* and generate an answer over the PDF's retrieved context.

    Step 1: ask the router (QuestionRouter) whether the retrieved CONTEXT can
    answer the question.  Step 2: generate either a grounded answer
    (GenerationPrompt) or a missing-information list (MissingInformation).
    Returns the raw chat-completion object from the second call.

    :raises ValueError: if the router returns an unrecognised datasource.
    """
    docs = await get_docs(question, pdf_content, file_name)
    context = "\n\n".join(docs)
    message = f"CONTEXT:\n\n{context}\n\nQUESTION :\n\n{question}\n\n"
    model = "meta-llama/llama-3-70b-instruct:nitro"
    messages = [
        {"role": "system", "content": QuestionRouter},
        {"role": "user", "content": message},
    ]
    response_str = await client.chat.completions.create(
        messages=messages, model=model, temperature=0
    )
    print(response_str)
    source = json.loads(response_str.choices[0].message.content)
    print(source)

    datasource = source["datasource"].lower()
    if datasource == "vectorstore":
        print("---ROUTE QUESTION TO RAG---")
        system_prompt = GenerationPrompt
    elif datasource == "missing_information":
        print("---NO SUFFICIENT INFORMATION---")
        system_prompt = MissingInformation
    else:
        # BUGFIX: an unexpected router output previously fell through both
        # branches and crashed with UnboundLocalError on `output`; fail with
        # a clear error instead.
        raise ValueError(f"Unexpected datasource from router: {source}")

    message = f"CONTEXT:\n\n{context}\n\nQUESTION:\n\n{question}\n\nANSWER:\n"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    output = await client.chat.completions.create(
        messages=messages, model=model, temperature=0
    )

    return output
803
-
804
-
805
async def process_queries(queries, pdf_content, file_name):
    """
    Answer every query concurrently against the same PDF via OpenRouter.

    Returns the answer texts in the same order as *queries*.
    """
    async_client = openai.AsyncClient(
        api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1"
    )
    async with async_client as aclient:
        # gather() preserves input order, so results line up with queries.
        completions = await asyncio.gather(
            *(answer(aclient, query, pdf_content, file_name) for query in queries)
        )

    return [completion.choices[0].message.content for completion in completions]
819
-
820
-
821
- async def web_search(session, question):
822
- data = {
823
- "topic": "",
824
- "description": question,
825
- "user_id": "",
826
- "user_name": "",
827
- "internet": True,
828
- "output_format": "report_table",
829
- "data_format": "No presets",
830
- }
831
- async with session.post(
832
- "https://pvanand-search-generate-staging.hf.space/generate_report",
833
- json=data,
834
- headers={"Content-Type": "application/json"},
835
- ) as response:
836
- print(f"Status: {response.status}")
837
- print(f"Headers: {response.headers}")
838
- content = await response.text()
839
- print(f"Content: {content[:200]}...") # Print first 200 chars of content
840
- if response.headers.get("Content-Type", "").startswith("application/json"):
841
- return await response.json()
842
- else:
843
- raise ValueError(
844
- f"Unexpected content type: {response.headers.get('Content-Type')}"
845
- )
846
-
847
-
848
- async def other_info(pdf_content, file_name):
849
- data = await industry(pdf_content, file_name)
850
- industry_company = data.get("industry")
851
- niche = data.get("niche")
852
-
853
- # Define the questions for each category
854
- questions = {
855
- "Risk Involved": f"What are risk involved in the starting a {niche} business in {industry_company}?",
856
- "Barrier To Entry": f"What are barrier to entry for a {niche} business in {industry_company}?",
857
- "Competitors": f"Who are the main competitors in the market for {niche} business in {industry_company}?",
858
- "Challenges": f"What are in the challenges in the {niche} business for {industry_company}?",
859
- }
860
-
861
- # Fetch the results for each category
862
- results = {}
863
- async with aiohttp.ClientSession() as session:
864
- tasks = [web_search(session, question) for question in questions.values()]
865
- responses = await asyncio.gather(*tasks, return_exceptions=True)
866
-
867
- for type_, response in zip(questions, responses):
868
- if isinstance(response, Exception):
869
- results[type_] = {"error": str(response)}
870
- else:
871
- results[type_] = markdown(response)
872
-
873
- return results
874
-
875
-
876
- @cache(expire=604800)
877
- async def upload_pitchdeck(pdf_content: bytes, file_name: str, Funding: float):
878
-
879
- # Assuming process_queries and other_info are your own defined async functions
880
- query_results = await process_queries(queries, pdf_content, file_name)
881
- other_info_results = await other_info(pdf_content, file_name)
882
- grading_results = json.loads(
883
- investment(queries, query_results, other_info_results, Funding)
884
- )
885
-
886
- return {
887
- "queries": queries,
888
- "query_results": query_results,
889
- "other_info_results": other_info_results,
890
- "grading_results": grading_results,
891
- }
892
-
893
-
894
- @app.post("/investor")
895
- async def process_pitchdeck(file: UploadFile = File(...), Funding: float = Form(...)):
896
- if not file:
897
- raise HTTPException(status_code=400, detail="PDF file not provided")
898
- pdf_content = await file.read()
899
- file_name = file.filename
900
-
901
- return await upload_pitchdeck(pdf_content, file_name, Funding)
902
-
903
-
904
- app.add_middleware(
905
- CORSMiddleware,
906
- allow_origins=["*"],
907
- allow_credentials=True,
908
- allow_methods=["*"],
909
- allow_headers=["*"],
910
- )
 
1
+ from fastapi import FastAPI, UploadFile, HTTPException
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import asyncio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ from utils.PdfUtils import ProcessPdf
6
+ from utils.HelperFunctions import (
7
+ generate_file_id,
8
+ save_to_database,
9
+ retrieve_from_database,
10
+ )
11
+ from utils.VectorDatabase import AdvancedClient
12
+ from utils.ModelCallingFunctions import (
13
+ industry_finder,
14
+ other_info,
15
+ business_information,
16
+ )
17
 
18
# Application singleton and the shared, persistent vector-store client.
app = FastAPI()

# One ChromaDB-backed client reused across all requests.
client = AdvancedClient("VectorDB")
21
+
22
+
23
@app.post(
    "/get_analysis",
    responses={
        200: {
            "description": "Successful Response",
            "content": {
                "application/json": {
                    "example": {
                        "industry": {
                            "pitch-deck": "File Name",
                            "industry": "XYZ",
                            "niche": "ABC",
                        },
                        "other_info": {
                            "Risk Involved": "Markdown",
                            "Barrier To Entry": "Markdown",
                            "Competitors": "Markdown",
                            "Challenges": "Markdown",
                        },
                        "business_info": {
                            "product-and-market": "{...}",
                            "team-and-strategy": "{...}",
                            "financials": "{...}",
                        },
                    }
                }
            },
        }
    },
)
async def get_analysis(pdf_file: UploadFile):
    """Analyse an uploaded pitch-deck PDF.

    The deck is content-addressed (hash of its first 4 KiB): the first upload
    is chunked, embedded, analysed and cached; repeat uploads are served from
    the SQLite cache without re-running the models.
    """
    if not pdf_file:
        raise HTTPException(status_code=400, detail="Pitch PDF file not provided")
    pdf_content = await pdf_file.read()
    pdf_id = generate_file_id(pdf_content)
    file_name = pdf_file.filename

    existing_ids = {c.name for c in client.client.list_collections()}
    if pdf_id not in existing_ids:
        pdf_chunks = ProcessPdf(pdf_content=pdf_content)
        client.create_collection(collection_id=pdf_id, file_datas=pdf_chunks)

        # Pitch-deck information extraction and structuring.
        industry_info = industry_finder(collection_id=pdf_id)
        industry_info["pitch-deck"] = file_name

        # BUG FIX: asyncio.run() raises RuntimeError inside a running event
        # loop (which is exactly where an async FastAPI handler executes);
        # these coroutines must be awaited directly.
        other_info_results = await other_info(company_data=industry_info)
        business_info = await business_information(collection_id=pdf_id)

        # Renamed from `json` to avoid shadowing the json module name.
        result = {
            "industry": industry_info,
            "other_info": other_info_results,
            "business_info": business_info,
        }
        save_to_database(_id=pdf_id, data=result)
        return result
    else:
        # Deck already processed: serve the cached analysis.
        return retrieve_from_database(_id=pdf_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,64 +1,142 @@
 
 
 
 
1
  annotated-types==0.7.0
2
  anyio==4.4.0
3
- cachetools==5.3.3
4
- certifi==2024.2.2
 
 
 
 
 
 
 
5
  charset-normalizer==3.3.2
 
 
 
 
 
6
  distro==1.9.0
7
- exceptiongroup==1.2.1
8
- google-ai-generativelanguage==0.6.4
9
- google-api-core==2.19.0
10
- google-api-python-client==2.131.0
11
- google-auth==2.29.0
 
 
 
 
 
 
 
 
 
 
12
  google-auth-httplib2==0.2.0
13
- google-generativeai==0.5.4
14
- googleapis-common-protos==1.63.0
15
- grpcio==1.64.0
16
- grpcio-status==1.62.2
17
  h11==0.14.0
 
18
  httpcore==1.0.5
19
  httplib2==0.22.0
20
- httpx==0.27.0
21
- idna==3.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  numpy==1.26.4
23
- openai==1.30.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  pandas==2.2.2
25
- Pillow==9.5.0
26
- proto-plus==1.23.0
27
- protobuf==4.25.3
 
 
28
  pyasn1==0.6.0
29
  pyasn1_modules==0.4.0
30
- pydantic==2.7.2
31
- pydantic_core==2.18.3
32
- PyMuPDF==1.24.5
33
- PyMuPDFb==1.24.3
34
- pyparsing==3.1.2
 
 
 
 
 
35
  python-dateutil==2.9.0.post0
36
  python-dotenv==1.0.1
 
37
  pytz==2024.1
 
 
 
38
  requests==2.32.3
 
 
 
39
  rsa==4.9
 
40
  six==1.16.0
 
41
  sniffio==1.3.1
42
- tqdm==4.66.4
43
- typing_extensions==4.12.0
 
 
 
 
 
 
 
 
 
 
44
  tzdata==2024.1
45
  uritemplate==4.1.1
46
- urllib3==2.2.1
47
- psycopg2-binary==2.9.9
48
- pinecone-client==4.1.0
49
- cohere==5.5.4
50
- tiktoken==0.7.0
51
- html2text == 2024.2.26
52
- mistune==3.0.2
53
- tenacity==8.3.0
54
- streamlit==1.35.0
55
- pdfkit==1.0.0
56
- Markdown==3.6
57
- xhtml2pdf== 0.2.16
58
- reportlab==4.2.0
59
- beautifulsoup4==4.12.3
60
- fastapi==0.111.0
61
- uvicorn==0.29.0
62
- onnxruntime==1.18.0
63
- aiohttp==3.9.5
64
- fastapi-cache2
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.5
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
  annotated-types==0.7.0
6
  anyio==4.4.0
7
+ asgiref==3.8.1
8
+ async-timeout==4.0.3
9
+ attrs==24.2.0
10
+ backoff==2.2.1
11
+ bcrypt==4.2.0
12
+ blinker==1.8.2
13
+ build==1.2.2
14
+ cachetools==5.5.0
15
+ certifi==2024.8.30
16
  charset-normalizer==3.3.2
17
+ chroma-hnswlib==0.7.6
18
+ chromadb==0.5.5
19
+ click==8.1.7
20
+ coloredlogs==15.0.1
21
+ Deprecated==1.2.14
22
  distro==1.9.0
23
+ dnspython==2.6.1
24
+ email_validator==2.2.0
25
+ exceptiongroup==1.2.2
26
+ fastapi==0.114.1
27
+ fastapi-cli==0.0.5
28
+ filelock==3.16.0
29
+ flatbuffers==24.3.25
30
+ frozenlist==1.4.1
31
+ fsspec==2024.9.0
32
+ gitdb==4.0.11
33
+ GitPython==3.1.43
34
+ google-ai-generativelanguage==0.6.6
35
+ google-api-core==2.19.2
36
+ google-api-python-client==2.144.0
37
+ google-auth==2.34.0
38
  google-auth-httplib2==0.2.0
39
+ google-generativeai==0.7.2
40
+ googleapis-common-protos==1.65.0
41
+ grpcio==1.66.1
42
+ grpcio-status==1.62.3
43
  h11==0.14.0
44
+ html2text==2024.2.26
45
  httpcore==1.0.5
46
  httplib2==0.22.0
47
+ httptools==0.6.1
48
+ httpx==0.27.2
49
+ huggingface-hub==0.24.6
50
+ humanfriendly==10.0
51
+ idna==3.8
52
+ importlib_metadata==8.4.0
53
+ importlib_resources==6.4.5
54
+ Jinja2==3.1.4
55
+ jiter==0.5.0
56
+ jsonschema==4.23.0
57
+ jsonschema-specifications==2023.12.1
58
+ kubernetes==30.1.0
59
+ Markdown==3.7
60
+ markdown-it-py==3.0.0
61
+ MarkupSafe==2.1.5
62
+ mdurl==0.1.2
63
+ mmh3==4.1.0
64
+ monotonic==1.6
65
+ mpmath==1.3.0
66
+ multidict==6.1.0
67
+ narwhals==1.6.4
68
  numpy==1.26.4
69
+ oauthlib==3.2.2
70
+ onnxruntime==1.19.2
71
+ openai==1.44.1
72
+ opentelemetry-api==1.27.0
73
+ opentelemetry-exporter-otlp-proto-common==1.27.0
74
+ opentelemetry-exporter-otlp-proto-grpc==1.27.0
75
+ opentelemetry-instrumentation==0.48b0
76
+ opentelemetry-instrumentation-asgi==0.48b0
77
+ opentelemetry-instrumentation-fastapi==0.48b0
78
+ opentelemetry-proto==1.27.0
79
+ opentelemetry-sdk==1.27.0
80
+ opentelemetry-semantic-conventions==0.48b0
81
+ opentelemetry-util-http==0.48b0
82
+ orjson==3.10.7
83
+ overrides==7.7.0
84
+ packaging==24.1
85
  pandas==2.2.2
86
+ pillow==10.4.0
87
+ posthog==3.6.5
88
+ proto-plus==1.24.0
89
+ protobuf==4.25.4
90
+ pyarrow==17.0.0
91
  pyasn1==0.6.0
92
  pyasn1_modules==0.4.0
93
+ pydantic==2.9.1
94
+ pydantic_core==2.23.3
95
+ pydeck==0.9.1
96
+ Pygments==2.18.0
97
+ PyMuPDF==1.24.10
98
+ PyMuPDFb==1.24.10
99
+ pyparsing==3.1.4
100
+ PyPika==0.48.9
101
+ pyproject_hooks==1.1.0
102
+ pysqlite3-binary==0.5.3.post1
103
  python-dateutil==2.9.0.post0
104
  python-dotenv==1.0.1
105
+ python-multipart==0.0.9
106
  pytz==2024.1
107
+ PyYAML==6.0.2
108
+ referencing==0.35.1
109
+ regex==2024.7.24
110
  requests==2.32.3
111
+ requests-oauthlib==2.0.0
112
+ rich==13.8.1
113
+ rpds-py==0.20.0
114
  rsa==4.9
115
+ shellingham==1.5.4
116
  six==1.16.0
117
+ smmap==5.0.1
118
  sniffio==1.3.1
119
+ starlette==0.38.5
120
+ streamlit==1.38.0
121
+ sympy==1.13.2
122
+ tenacity==8.5.0
123
+ tiktoken==0.7.0
124
+ tokenizers==0.20.0
125
+ toml==0.10.2
126
+ tomli==2.0.1
127
+ tornado==6.4.1
128
+ tqdm==4.66.5
129
+ typer==0.12.5
130
+ typing_extensions==4.12.2
131
  tzdata==2024.1
132
  uritemplate==4.1.1
133
+ urllib3==2.2.2
134
+ uvicorn==0.30.6
135
+ uvloop==0.20.0
136
+ watchdog==4.0.2
137
+ watchfiles==0.24.0
138
+ websocket-client==1.8.0
139
+ websockets==13.0.1
140
+ wrapt==1.16.0
141
+ yarl==1.11.1
142
+ zipp==3.20.1
 
 
 
 
 
 
 
 
 
utils/.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ TOGETHER_API="<REDACTED-ROTATE-THIS-KEY>"
2
+ GEMINI_API="<REDACTED-ROTATE-THIS-KEY>"
3
+ OPENROUTER_API_KEY="<REDACTED-ROTATE-THIS-KEY>"
4
+ X_API_KEY="<REDACTED-ROTATE-THIS-KEY>"
classifier.py → utils/ChartClassifier.py RENAMED
@@ -13,7 +13,7 @@ class Classifier:
13
  """
14
  img : PIL Image object of shape (B,HxW,C)
15
  """
16
- img = img.resize((192,192))
17
  np_image = np.asarray(img) / 255
18
  return np_image.astype(np.float32)
19
 
 
13
  """
14
  img : PIL Image object of shape (B,HxW,C)
15
  """
16
+ img = img.resize((192, 192))
17
  np_image = np.asarray(img) / 255
18
  return np_image.astype(np.float32)
19
 
utils/HelperFunctions.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from typing import List
3
+ import hashlib
4
+ import re
5
+ import sqlite3
6
+
7
+ import sqlite3
8
+ import json
9
+
10
+
11
def save_to_database(_id, data, db_path="utils/information.db"):
    """Persist a JSON-serialisable *data* payload under key *_id*.

    Rows with the same id are overwritten (INSERT OR REPLACE). *db_path*
    defaults to the project store but may be overridden, e.g. in tests.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on error; the
        # connection itself still has to be closed explicitly.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS json_data (
                    id TEXT PRIMARY KEY,
                    data TEXT
                )
                """
            )
            conn.execute(
                "INSERT OR REPLACE INTO json_data (id, data) VALUES (?, ?)",
                (_id, json.dumps(data)),
            )
    finally:
        conn.close()
38
+
39
+
40
def retrieve_from_database(_id, db_path="utils/information.db"):
    """Return the JSON payload stored under *_id*, or None if not found.

    Also returns None when the table has never been created (a fresh
    database), instead of raising sqlite3.OperationalError.
    """
    conn = sqlite3.connect(db_path)
    try:
        try:
            row = conn.execute(
                "SELECT data FROM json_data WHERE id = ?", (_id,)
            ).fetchone()
        except sqlite3.OperationalError:
            # Table does not exist yet — nothing has been cached.
            return None
    finally:
        conn.close()
    return json.loads(row[0]) if row else None
53
+
54
+
55
def generate_file_id(file_bytes: bytes) -> str:
    """Derive a stable identifier from the leading 4 KiB of a file."""
    # Hashing only the first 4096 bytes keeps this O(1) for large uploads.
    digest = hashlib.sha256(file_bytes[:4096]).hexdigest()
    # Truncated to 63 characters (collection-name length limit downstream).
    return str(digest[:63])
62
+
63
+
64
def extract_content(text):
    """Return the first <report-chart>...</report-chart> payload in *text*.

    Returns an empty string when no tag is present (the original code
    raised IndexError on ``matches[0]`` for tag-less input).
    """
    matches = re.findall(r"<report-chart>(.*?)</report-chart>", text, re.DOTALL)
    return matches[0] if matches else ""
68
+
69
+
70
def CountTokens(texts: List[str]) -> List[int]:
    """Return the token count of each string in *texts*.

    Uses the gpt-3.5-turbo BPE encoding as a proxy for chunk sizing.
    """
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return [len(tokens) for tokens in encoder.encode_batch(texts)]
78
+
79
+
80
def web_search_result_processor(output):
    """Extract the report payload from a raw web-search response string.

    The search endpoint wraps the useful content in <report-chart> tags;
    everything else is discarded. (An earlier HTML-to-Markdown pipeline
    that lived here as commented-out code has been removed.)
    """
    return extract_content(output)
utils/HyDE.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53acfcd33e1526015a426b059d1636d887bc3ac4e0c7fde62f0e32e456651aa8
3
+ size 111026
utils/ModelCallingFunctions.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ from PIL import Image
4
+ from dotenv import load_dotenv
5
+ import json
6
+ import pickle
7
+
8
+ import asyncio
9
+ import aiohttp
10
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
11
+
12
+ from openai import OpenAI, AsyncClient
13
+ import google.generativeai as gemini
14
+
15
+ from .VectorDatabase import AdvancedClient
16
+ from .HelperFunctions import web_search_result_processor
17
+
18
+ from .prompts import PROMPTS
19
+
20
# Load API credentials from the package-local .env file.
load_dotenv("utils/.env")

TOGETHER_API = os.getenv("TOGETHER_API")
GEMINI_API = os.getenv("GEMINI_API")
X_API_KEY = os.getenv("X_API_KEY")

# Shared vector-store client used by every retrieval in this module.
client = AdvancedClient(vector_database_path="VectorDB")

# Pre-computed HyDE (hypothetical document) query embeddings, keyed as
# section -> sub-question; generated offline and shipped as a pickle.
with open("utils/HyDE.bin", "rb") as hyde_file:
    HyDE = pickle.load(hyde_file)
30
+
31
+
32
def image_data_extractor(img: Image.Image, text: str) -> str:
    """Describe the chart/table content of *img* via Gemini.

    *text* is the page text surrounding the image, passed as context so
    the model can anchor numeric figures it reads off the chart.
    """
    gemini.configure(api_key=GEMINI_API)
    prompt = PROMPTS["gemini-image"].format(text=text)
    model = gemini.GenerativeModel("gemini-1.5-flash")
    result = model.generate_content([prompt, img], stream=False)
    return result.text
38
+
39
+
40
def generate_embedding(
    texts: List[str], embedding_model: str = "BAAI/bge-large-en-v1.5"
) -> List[List[float]]:
    """Embed each string in *texts* using a Together-hosted embedding model."""
    api = OpenAI(api_key=TOGETHER_API, base_url="https://api.together.xyz/v1")
    items = api.embeddings.create(input=texts, model=embedding_model).data
    return [item.embedding for item in items]
51
+
52
+
53
def industry_finder(collection_id):
    """Infer the company's industry and niche from the stored pitch deck.

    Retrieves the five most relevant chunks for a fixed probing question
    and asks the model (JSON-mode system prompt) to classify the business.
    Returns the parsed dict, e.g. {"industry": ..., "niche": ...}.
    """
    question = (
        "What is the name and its specific niche business this document pertains to."
    )
    docs = client.retrieve_chunks(
        collection_id=collection_id, query=question, number_of_chunks=5
    )
    context = "\n\n".join(docs)
    raw = response(
        message=f"CONTEXT\n\n{context}\n\n",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        SysPrompt=PROMPTS["industry-finder"],
        temperature=0,
    )
    return json.loads(raw)
72
+
73
+
74
async def web_search(session, question):
    """POST *question* to the search-assistant service; return the raw body text."""
    payload = {"query": question, "model_id": "openai/gpt-4o-mini"}
    headers = {"X-API-KEY": X_API_KEY, "Content-Type": "application/json"}
    async with session.post(
        "https://general-chat.elevatics.cloud/search-assistant",
        json=payload,
        headers=headers,
    ) as response:
        print(f"Status: {response.status}")
        print(f"Content: {response.content}")
        content = await response.text()
        return content
85
+
86
+
87
async def other_info(company_data):
    """Run four market-research web searches for the company's industry/niche.

    Returns a dict keyed by research category; a failed search is reported
    as {"error": <message>} instead of failing the whole batch.
    """
    industry_company = company_data.get("industry")
    niche = company_data.get("niche")

    questions = {
        "Risk Involved": f"What are risk involved in the starting a {niche} business in {industry_company}?, please be concise.",
        "Barrier To Entry": f"What are barrier to entry for a {niche} business in {industry_company}?, please be concise.",
        "Competitors": f"Who are the main competitors in the market for {niche} business in {industry_company}?, please be concise.",
        "Challenges": f"What are in the challenges in the {niche} business for {industry_company}?, please be concise.",
    }

    results = {}
    async with aiohttp.ClientSession() as session:
        replies = await asyncio.gather(
            *(web_search(session, q) for q in questions.values()),
            return_exceptions=True,
        )
        for label, reply in zip(questions, replies):
            if isinstance(reply, Exception):
                results[label] = {"error": str(reply)}
            else:
                results[label] = reply

    return results
112
+
113
+
114
async def answer(client, context: str, SysPrompt: str):
    """Ask the 70B chat model to answer from *context* under *SysPrompt*.

    *client* is an async OpenAI-compatible client. Returns the completion
    text. (Removed leftover debug prints "herere"/"nononon".)
    """
    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": f"CONTEXT:\n\n{context}"},
    ]
    completion = await client.chat.completions.create(
        messages=messages,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        temperature=0,
    )
    return completion.choices[0].message.content
129
+
130
+
131
async def business_information(collection_id):
    """Answer every structured due-diligence prompt against the stored deck.

    For each section ("product-and-market", "team-and-strategy",
    "financials") and each sub-question prompt, retrieves the nearest deck
    chunks via a precomputed HyDE embedding and asks the model concurrently.
    Returns {section: {sub_question: answer_text}}.
    """
    async_client = AsyncClient(
        api_key=TOGETHER_API, base_url="https://api.together.xyz/v1"
    )
    keys = ["product-and-market", "team-and-strategy", "financials"]

    # Record (section, sub-question) in task submission order so responses
    # can be mapped back reliably.
    task_index = []
    async with async_client as aclient:
        tasks = []
        for i_key in keys:
            for j_key in PROMPTS[i_key]:
                embedding = HyDE[i_key][j_key]
                sys_prompt = PROMPTS[i_key][j_key]
                chunks = client.retrieve_chunks(
                    collection_id=collection_id, query_embedding=embedding
                )
                context = "\n\n".join(chunks)
                tasks.append(
                    asyncio.create_task(
                        answer(client=aclient, context=context, SysPrompt=sys_prompt)
                    )
                )
                task_index.append((i_key, j_key))
                # Light pacing between submissions (rate-limit friendliness).
                await asyncio.sleep(1)

        responses = await asyncio.gather(*tasks)

    # BUG FIX: the old `i_count * 4 + j_count` indexing silently assumed
    # exactly four prompts per section; zip against the recorded order.
    response_dict = {key: {} for key in keys}
    for (i_key, j_key), reply in zip(task_index, responses):
        response_dict[i_key][j_key] = reply
    return response_dict
162
+
163
+
164
def response(
    message: str,
    model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    SysPrompt: str = PROMPTS["default"],
    temperature: float = 0.2,
) -> str:
    """Single-turn chat completion with exponential-backoff retries.

    Fixed the annotations (everything was typed ``object``) and removed the
    per-attempt debug print. Returns the model's text, or the sentinel
    string "NONE" once the six retries are exhausted — callers check for
    that value.
    """
    api = OpenAI(api_key=TOGETHER_API, base_url="https://api.together.xyz/v1")

    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]

    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        return api.chat.completions.create(**kwargs)

    try:
        completion = completion_with_backoff(
            model=model,
            messages=messages,
            temperature=temperature,
            frequency_penalty=0.2,
        )
        return str(completion.choices[0].message.content)
    except Exception as e:
        print(f"An error occurred: {e}")
        return "NONE"
utils/PdfUtils.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy
2
+ from PIL import Image
3
+ from typing import List, Tuple
4
+
5
+ import pymupdf
6
+
7
+ from .ChartClassifier import Classifier
8
+ from .HelperFunctions import CountTokens
9
+ from .ModelCallingFunctions import image_data_extractor
10
+
11
+
12
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Describe every chart-like image in *pixmap_list* with the vision model.

    Images are first screened by the ONNX chart classifier; only detected
    charts are sent to Gemini. *text* is the surrounding page text used as
    context. Returns one description string per detected chart.
    (Removed a leftover debug print of the classifier output.)
    """
    # Fresh inference session per call; model file ships with the package.
    classifier = Classifier("utils/graph_classifierV2_B.onnx")

    img_list = []
    for pixmap in pixmap_list:
        try:
            img_list.append(
                Image.frombytes(
                    mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
                )
            )
        except Exception as e:
            # Skip pixmaps that cannot be decoded (e.g. unexpected colourspace).
            print(e)

    graph_flags = classifier.classify(img_list)

    return [
        str(image_data_extractor(img=img_list[idx], text=text))
        for idx, is_graph in enumerate(graph_flags)
        if is_graph
    ]
41
+
42
+
43
def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, int]]:
    """
    Takes PDF(bytes) and returns a list of (text, page_number) tuples, where
    the text combines each page's textual content with descriptions of any
    chart images found on it. Pages longer than ~500 tokens are split into
    roughly equal character slices, each tagged with the same page number.
    """
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")

    pages_content = []
    refered_xref = []  # image xrefs already processed (images reused across pages)
    for page_number in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_number)

        # Textual content, with newlines flattened to tabs.
        text_content = str(page.get_text()).replace("\n", "\t")
        page_content = text_content

        # Image content: collect pixmaps not seen on earlier pages.
        pixmap_list = []
        for img_info in page.get_image_info(xrefs=True):
            xref = img_info["xref"]
            if xref not in refered_xref:
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    refered_xref.append(xref)
                except ValueError as e:
                    print(f"Skipping image with due to error: {e}")
        if pixmap_list:
            img_content = extract_image_content(
                pixmap_list=pixmap_list, text=text_content
            )
            page_content = page_content + "\n\n" + "\n\n".join(img_content)

        pages_content.append(page_content)

    num_tokens = CountTokens(pages_content)

    final_data = []

    # Split any page whose content exceeds ~500 tokens into n roughly equal
    # character slices (token counts approximated proportionally by length).
    for e, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = int(numpy.ceil(n_token / 500))
            len_content = len(pages_content[e])
            part_size = len_content // n_parts
            start = 0
            for part in range(n_parts):
                # BUG FIX: the old loop advanced by part_size for the last
                # slice too, dropping up to n_parts-1 trailing characters;
                # the final slice now always runs to the end of the page.
                end = len_content if part == n_parts - 1 else start + part_size
                final_data.append((pages_content[e][start:end], e + 1))
                start = end
        else:
            final_data.append((pages_content[e], e + 1))

    pdf_doc.close()
    return final_data
utils/VectorDatabase.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contain Wrapper Class for ChormaDB client, that can process and store documents and retrive document chunks.
3
+ """
4
+
5
+ # for chromaDB
6
+ __import__("pysqlite3")
7
+ import sys
8
+
9
+ sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
10
+
11
+ from typing import List, Optional, Tuple
12
+ import chromadb
13
+
14
+
15
class AdvancedClient:
    """Thin wrapper around a persistent ChromaDB client.

    Stores pre-chunked documents with externally generated embeddings and
    retrieves the nearest chunks for a query string or a raw embedding.
    """

    def __init__(self, vector_database_path: str = "vectorDB") -> None:
        self.client = chromadb.PersistentClient(path=vector_database_path)

    def create_collection(
        self,
        collection_id: str,
        file_datas: List[Tuple[str, int]],
    ):
        """Create collection *collection_id* from (chunk_text, page_number) pairs."""
        chunks = [chunk for chunk, _ in file_datas]
        # BUG FIX: page numbers repeat when ProcessPdf splits a long page
        # into several chunks, and ChromaDB rejects duplicate ids; suffix a
        # running index to keep ids unique while preserving the page number.
        ids = [f"{page}-{idx}" for idx, (_, page) in enumerate(file_datas)]

        # Imported lazily to avoid a circular import with ModelCallingFunctions.
        from .ModelCallingFunctions import generate_embedding

        embeddings = generate_embedding(texts=chunks)

        collection = self.client.create_collection(collection_id)
        collection.add(
            ids=ids,
            embeddings=embeddings,  # type: ignore
            documents=chunks,
        )

    def retrieve_chunks(
        self,
        collection_id: str,
        query: str = "NONE",
        query_embedding: Optional[List[float]] = None,
        number_of_chunks: int = 3,
    ):
        """Return the *number_of_chunks* documents nearest to the query.

        If *query_embedding* is given it is used directly; otherwise *query*
        is embedded on the fly.
        """
        collection = self.client.get_collection(name=collection_id)

        if query_embedding is None:  # idiom fix: was `== None`
            from .ModelCallingFunctions import generate_embedding

            query_emb = generate_embedding([query])[0]
        else:
            query_emb = query_embedding

        results = collection.query(
            query_embeddings=query_emb,
            n_results=number_of_chunks,
        )

        return results["documents"][0]  # pyright: ignore
graph_classifierV2_B.onnx → utils/graph_classifierV2_B.onnx RENAMED
File without changes
utils/prompts.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompts for the investor-agent report sections, keyed first by
# report section, then by subsection. NOTE: the "financial-peformance" key
# is misspelled but kept as-is — presumably callers look it up by this
# exact string; do not rename without updating them.
PROMPTS = {
    "gemini-image": "You are provided with the images extracted from a pitch-deck and some text surrounding the image from the same pitch deck. Extract all the factual information that the image is trying to communicate through line charts, area line charts, bar charts, pie charts, tables, et cetera. Use OCR to extract numerical figures and include them in the information. If the image does not have any information, like it's a blank image or an image of a person, then the response should be NOTHING. Do not add any additional comments or markdown, just give information. \n\n SURROUNDING TEXT \n\n{text}",
    "industry-finder": """You are a business strategy consultant. You have been identifying niche markets and industries for companies across various sectors for over 20 years. Your expertise lies in analyzing detailed CONTEXT to accurately pinpoint the niche and industry of a business.
Objective: Identify the niche and industry of a business by analyzing the provided CONTEXT.
Steps to follow:
Read the context: Carefully read the provided information to understand the business's products, services, target audience, and unique value propositions.
Determine the industry: Based on the provided CONTEXT, identify the primary industry to which the business belongs. Consider factors such as the type of products/services offered, the market served, and industry-specific terminology.
Identify the niche: Analyze the details to pinpoint the specific niche within the industry. Look for unique aspects of the business, specialized market segments, or specific customer needs that the business addresses.
Provide output in JSON format: Clearly state the identified industry and niche in a JSON format. Ensure your reasoning supports the identified industry and niche. The output should be JSON. Do not add any additional formatting.
Output format:
{
"industry": "Identified industry here",
"niche": "Identified niche here"
}
Take a deep breath and work on this problem step-by-step.""",
    "default": "You are now in the role of an expert AI.",
    "product-and-market": {
        "product-service-overview": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the product or services offered by the business. Provide 5 to 10 factual bullet points, with a strong emphasis on the CONTEXT. If the information provided is insufficient or unclear, respond with 'Not enough information to provide a complete analysis.'""",
        "target-customer-problem-solved": """You are an experienced investor evaluating a pitch. Based on the provided CONTEXT, identify the target customers for the business and describe the problem the product or service is solving. Provide 5 to 10 factual bullet points focusing on customer pain points, market size, and the urgency of the problem. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "revenue-and-pricing": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the revenue streams and pricing strategy of the business. Provide 5 to 10 factual bullet points, focusing on how the business generates revenue, its pricing models, and any monetization strategies. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "market-growth-opportunity": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the market size, potential growth opportunities, and the business’s positioning within the industry. Provide 5 to 10 factual bullet points, focusing on market trends, scalability, and any competitive advantages. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
    "team-and-strategy": {
        "competitive-differentiation": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the business’s competitive differentiation. Provide 5 to 10 factual bullet points, focusing on how the company’s product, service, or strategy stands out from competitors, including unique features, intellectual property, or market positioning. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "partnerships-distribution": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the business’s partnerships and distribution strategy. Provide 5 to 10 factual bullet points, focusing on key partnerships, distribution channels, and how these contribute to the company’s growth and market reach. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "sales-marketing": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the business’s sales and marketing strategy. Provide 5 to 10 factual bullet points, focusing on the sales channels, marketing tactics, customer acquisition strategies, and how the business plans to scale its efforts. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "key-members": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the founder and key team members of the business. Provide 5 to 10 factual bullet points, focusing on their backgrounds, relevant experience, and roles in executing the business vision. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
    "financials": {
        # TODO(review): key misspelled ("peformance") — kept for lookup compatibility.
        "financial-peformance": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the company’s financial performance. Provide 5 to 10 factual bullet points, focusing on revenue, profitability, cash flow, and key financial metrics. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "key-metrics": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the key financial metrics of the company. Provide 5 to 10 factual bullet points, focusing on important indicators such as gross margin, EBITDA, net profit, customer acquisition cost (CAC), lifetime value (LTV), and other relevant metrics. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "cost-drivers": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the company’s cost drivers. Provide 5 to 10 factual bullet points, focusing on the main expenses such as production costs, labor, marketing, technology, or other operational expenses. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "cash-projections": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the company’s cash flow projections. Provide 5 to 10 factual bullet points, focusing on projected cash inflows and outflows, sustainability of operations, and how the company plans to manage liquidity over time. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
}
36
# HyDE (Hypothetical Document Embeddings) seed passages: each entry is an
# imagined "ideal" document for its report subsection, embedded in place of
# the raw question to improve retrieval recall. Keys mirror the section /
# subsection structure of the prompt table used elsewhere in this module.
# NOTE(review): "financial-peformance" is misspelled — presumably callers
# look it up by this exact string, so do not rename without updating them.
HyDE_text = {
    "product-and-market": {
        "product-service-overview": """Imagine a company offering a unique product or service. The product is designed to solve a specific market problem and stands out due to its innovative features or superior quality. It addresses customer pain points and has the potential to scale in the industry. Now, based on the CONTEXT provided, retrieve documents that describe the products or services of the business at hand, highlighting their key attributes and competitive advantages.""",
        "target-customer-problem-solved": """Imagine a business that has identified a specific customer segment facing a significant problem. The company has designed a product or service that addresses this problem, providing a clear solution. The target customers experience pain points that affect their daily lives or business operations. Now, based on the CONTEXT provided, retrieve documents that detail the target customers and the problem the product or service is solving.""",
        "revenue-and-pricing": """Imagine a business with multiple revenue streams and a well-thought-out pricing strategy designed to maximize profitability. The company may offer subscription models, one-time purchases, or tiered pricing based on customer needs. The pricing reflects the value delivered to customers and the competitive landscape. Now, based on the CONTEXT provided, retrieve documents that explain the revenue streams and pricing strategy of the business.""",
        "market-growth-opportunity": """Imagine a company operating in a high-potential market with significant growth opportunities. The industry is expanding due to emerging trends, customer demand, or technological advancements. The company has identified key opportunities for growth, such as expanding to new markets, increasing customer segments, or leveraging competitive advantages. Now, based on the CONTEXT provided, retrieve documents that discuss the market size and growth opportunities for the business.""",
    },
    "team-and-strategy": {
        "competitive-differentiation": """Imagine a company that distinguishes itself from competitors through innovative products, unique services, or a strong market position. The company may have proprietary technology, a superior user experience, or a first-mover advantage. Now, based on the CONTEXT provided, retrieve documents that explain how the business differentiates itself from competitors and maintains a competitive edge""",
        "partnerships-distribution": """Imagine a company that has established strategic partnerships and a robust distribution network to enhance its market reach. The partnerships may involve suppliers, distributors, or technology partners, while the distribution strategy ensures the product or service is accessible to the target market through various channels. Now, based on the CONTEXT provided, retrieve documents that discuss the business’s key partnerships and distribution strategies.""",
        "sales-marketing": """Imagine a business with a well-defined sales and marketing strategy aimed at acquiring and retaining customers. The company uses various sales channels such as direct sales, online platforms, or partnerships, and employs marketing tactics like digital marketing, social media, or paid advertising to reach its audience. Now, based on the CONTEXT provided, retrieve documents that detail the business’s sales channels, marketing strategies, and customer acquisition plans.""",
        "key-members": """Imagine a business led by a visionary founder with a team of skilled professionals, each playing a critical role in driving the company forward. The founder brings relevant industry experience, and the key team members have complementary skills and leadership in areas like marketing, operations, or technology. Now, based on the CONTEXT provided, retrieve documents that describe the backgrounds, qualifications, and roles of the founder and key team members.""",
    },
    "financials": {
        "financial-peformance": """Imagine a company with a track record of financial performance, showing key metrics like revenue growth, profitability, and cash flow management. The company’s financial statements reflect its financial health, including historical performance and trends. Now, based on the CONTEXT provided, retrieve documents that detail the company’s financial performance, including revenue, profit margins, and other key financial indicators.""",
        "key-metrics": """Imagine a company presenting its key financial metrics to demonstrate its financial health and efficiency. Metrics like gross margin, EBITDA, net profit, and customer acquisition costs reflect the company’s profitability and operational efficiency. Now, based on the CONTEXT provided, retrieve documents that highlight the company's key financial metrics, including profitability, operational efficiency, and customer-related metrics.""",
        "cost-drivers": """Imagine a company identifying its main cost drivers that impact profitability. These might include production costs, labor expenses, marketing, technology investments, or operational overhead. Now, based on the CONTEXT provided, retrieve documents that explain the company’s major cost drivers and how these expenses affect its overall financial performance.""",
        "cash-projections": """Imagine a company presenting its cash flow projections, showing expected cash inflows and outflows over the coming quarters or years. The projections reflect the company’s ability to sustain operations, grow, and manage liquidity. Now, based on the CONTEXT provided, retrieve documents that explain the company’s cash flow projections, including key assumptions and expected financial outcomes.""",
    },
}