ashishbangwal commited on
Commit
37a2681
·
1 Parent(s): 9f65633

copy from nihtin repo | LFS issue solved

Browse files
Files changed (5) hide show
  1. Dockerfile +15 -0
  2. classifier.py +31 -0
  3. graph_classifierV2_B.onnx +3 -0
  4. main.py +906 -0
  5. requirements.txt +64 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.11.9

# Set the working directory in the container
WORKDIR /app

# Copy the current directory contents into the container at /app
COPY . /app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
classifier.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import onnxruntime
3
+
4
+
5
class Classifier:
    """Binary graph/chart image classifier backed by an ONNX model.

    The model expects a batch of 192x192 RGB images scaled to [0, 1]
    (float32) under the input name "images" and outputs one probability
    per image; probabilities above 0.5 are reported as class 1.
    """

    def __init__(self, onnx_fp: str) -> None:
        """Load the ONNX model from `onnx_fp` into an inference session.

        Raises whatever onnxruntime raises on a bad path/model: the old
        code printed and swallowed the error, leaving the instance without
        a `classifier` attribute and causing a confusing AttributeError
        later in classify(); failing fast is strictly better.
        """
        try:
            self.classifier = onnxruntime.InferenceSession(path_or_bytes=onnx_fp)
        except Exception as e:
            print(e)
            raise

    def preprocess(self, img):
        """Resize a PIL image to 192x192 and scale pixels to [0, 1] float32."""
        img = img.resize((192, 192))
        np_image = np.asarray(img) / 255
        return np_image.astype(np.float32)

    def classify(self, imgs):
        """Classify a list of PIL images; return a list of 0/1 ints.

        Returns [] for an empty input list (np.array([]) would otherwise
        build a rank-1 batch and crash the ONNX session).
        """
        if not imgs:
            return []

        processed_imgs = [self.preprocess(img) for img in imgs]

        batch = np.array(processed_imgs)
        onnx_input = {"images": batch}
        prediction = self.classifier.run(None, onnx_input)

        # Threshold the per-image probabilities at 0.5.
        return (prediction[0] > 0.5).astype(np.int8).flatten().tolist()
graph_classifierV2_B.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60c6e216dcd96b5304d737eef29f40d7e878744346045ae1165c884db0ab9518
3
+ size 16314912
main.py ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import psycopg2
4
+ import time
5
+ import re
6
+ import asyncio
7
+ import cohere
8
+ import numpy
9
+ import streamlit as st
10
+ import pdfkit
11
+ import json
12
+ import requests
13
+ import tempfile
14
+ import mistune
15
+ import markdown as md
16
+ import psycopg2
17
+ import html2text
18
+ from typing import List, Tuple, Dict
19
+ from pinecone import Pinecone, ServerlessSpec
20
+ import openai
21
+ import os
22
+ import pymupdf
23
+ import tiktoken
24
+ import google.generativeai as gemini
25
+ from PIL import Image
26
+ from PIL import PngImagePlugin # important to avoid google_genai AttributeError
27
+ import json
28
+ import hashlib
29
+ from dotenv import load_dotenv
30
+ from classifier import Classifier
31
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
32
+ from fastapi import FastAPI, HTTPException, UploadFile, File,Form
33
+ from fastapi.middleware.cors import CORSMiddleware
34
+ from fastapi_cache import FastAPICache
35
+ from fastapi_cache.backends.inmemory import InMemoryBackend
36
+ from fastapi_cache.decorator import cache
37
+ import aiohttp
38
+
39
+ app = FastAPI()
40
+
41
+ @app.on_event("startup")
42
+ async def startup():
43
+ FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")
44
+
45
+ load_dotenv()
46
+
47
+ TOGETHER_API_KEY = os.getenv("TOGETHER_API")
48
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
49
+ COHERE_API = os.getenv("COHERE_API")
50
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
51
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
52
+
53
+ gemini.configure(api_key=GEMINI_API_KEY)
54
+
55
+ client = openai.OpenAI(base_url="https://api.together.xyz/v1", api_key=TOGETHER_API_KEY)
56
+
57
+ SysPromptDefault = "You are now in the role of an expert AI."
58
+ GenerationPrompt = """You are an expert AI whose task is to ANSWER the user's QUESTION using the provided CONTEXT.
59
+ Forget everything you know, Fully rely on CONTEXT to provide the answer.
60
+ Follow these steps:
61
+ 1. Think deeply and multiple times about the user's QUESTION. You must understand the intent of their question and provide the most appropriate answer.
62
+ 2. Choose the most relevant content from the CONTEXT that addresses the user's question and use it to generate an answer.
63
+ Formating Instructions:
64
+ Respond only in markdown format; don't use big headings"""
65
+ QuestionRouter = """ You are an expert investor, You must identify if the provided CONTEXT can answer the user QUESTION.
66
+ 1 vectorstore : The provided CONTEXT is sufficient to answer the question.
67
+ 2 missing_information : The provided CONTEXT does not contains the answer.
68
+ output options: 'vectorstore' OR 'missing_information'.The output must be a valid JSON.Do not add any additional comments.
69
+ Output format:
70
+ {
71
+ "datasource":"identified option"
72
+ }
73
+ Return the a valid JSON with a single key 'datasource' and no preamble or explanation. Question to identify: QUESTION """
74
+ MissingInformation = """You are an expert in identifying missing information in a given CONTEXT to answer a QUESTION effectively. Your task is to analyze the CONTEXT, and pinpoint the missing content needed for each QUESTION. Take your time to process the information thoroughly and provide a list output without any additional comments. The output format should be valid markdown list , without any additional comments:
75
+ """
76
+ SummaryPrompt = """You are an expert AI specializing in document summarization. You have been refining your skills in text summarization and data extraction for over a decade, ensuring the highest accuracy and clarity. Your task is to read raw data from a PDF file page by page and provide a detailed summary of the CONTEXT while ensuring all numerical data is included in the summary without alteration. The output should be in Markdown format, with appropriate headers and lists to enhance readability. Follow these instructions:
77
+ 1.Summarize the Text: Provide a concise summary of the CONTEXT, capturing the main points and essential information.
78
+ 2.Retain Numerical Data: Ensure all numerical data (e.g., dates, statistics, financial figures, percentages, measurements) is included in the summary.
79
+ 3.Markdown Format: Format the output in Markdown, using headers, lists, and other Markdown elements appropriately.
80
+ Note: Whenever the CONTEXT is about a TEAM, DO NOT summarize; instead, output the content in a neat markdown format with Names and previous designation of the TEAM.
81
+ """
82
+ IndustryPrompt = """You are a business strategy consultant. You have been identifying niche markets and industries for companies across various sectors for over 20 years. Your expertise lies in analyzing detailed CONTEXT to accurately pinpoint the niche and industry of a business.
83
+ Objective: Identify the niche and industry of a business by analyzing the provided CONTEXT.
84
+ Steps to follow:
85
+ Read the context: Carefully read the provided information to understand the business's products, services, target audience, and unique value propositions.
86
+ Determine the industry: Based on the provided CONTEXT, identify the primary industry to which the business belongs. Consider factors such as the type of products/services offered, the market served, and industry-specific terminology.
87
+ Identify the niche: Analyze the details to pinpoint the specific niche within the industry. Look for unique aspects of the business, specialized market segments, or specific customer needs that the business addresses.
88
+ Provide output in JSON format: Clearly state the identified industry and niche in a JSON format. Ensure your reasoning supports the identified industry and niche.The output should JSON ,Do not add any additional format.
89
+ Output format:
90
+ {
91
+ "industry": "Identified industry here",
92
+ "niche": "Identified niche here",
93
+ "reasoning": "Explanation of how the industry and niche were identified based on the context"
94
+ }
95
+ Take a deep breath and work on this problem step-by-step.
96
+ """
97
+
98
+ Investment = """You are a professional financial analyst evaluating sectors for investment potential. Your task is to:
99
+
100
+ 1. Identify the sector from the provided CONTEXT.
101
+ 2. Grade only the specified KEYS on a scale of 1-10, with higher grades indicating better investment potential. Take a conservative approach in grading.
102
+ 3. Provide reasoning for each grade considering both qualitative and quantitative factors, including the FUNDING information.
103
+ 4. Assign weights to each section (total should equal 1).
104
+ 5. Calculate an overall weighted score.
105
+
106
+ Use only the information given in the CONTEXT and the FUNDING provided. Be conservative in your grading to reflect investment risks. Output your analysis in the following JSON format:
107
+
108
+ ```json
109
+ {
110
+ "sector": "Sector name",
111
+ "sections": [
112
+ {
113
+ "section": "Key from Context",
114
+ "score": "Grade (1-10)",
115
+ "weight": "Weight (0-1)",
116
+ "reasoning": "Detailed analysis including funding considerations"
117
+ }
118
+ ],
119
+ "overall_score": "Calculated weighted score"
120
+ }
121
+ ```
122
+
123
+ Grade only these KEYS from the CONTEXT:
124
+ 1. "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?"
125
+ 2. "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?"
126
+ 3. "How does the company's product/service differentiate itself from competitors in the market?"
127
+ 4. "What issue or challenge is the company addressing?"
128
+ 5. "Risks Involved"
129
+ 6. "Barrier To Entry"
130
+ 7. "Competitors"
131
+ 8. "Challenges"
132
+
133
+ Additionally, consider the FUNDING provided by the user:
134
+ - The FUNDING will be classified as follows:
135
+ * Low: Less than $1 million
136
+ * Medium: $1 million to $10 million
137
+ * High: More than $10 million
138
+ - Adjust your scoring based on the funding classification:
139
+ * For Low funding: Reduce scores by 1-2 points where relevant.
140
+ * For Medium funding: Keep scores as they would be without considering funding.
141
+ * For High funding: Increase scores by 1-2 points where relevant.
142
+ - Incorporate the funding information into your analysis of each relevant key.
143
+ - Consider how the funding level impacts various aspects such as projected financials, ability to execute plans, competitive positioning, and risk mitigation.
144
+ - Reflect the impact of funding in your scoring and reasoning for each relevant key.
145
+ -Don't explicitly mention the original funding in the answer but use them to give reasoning.
146
+
147
+
148
+ Provide your analysis based on the CONTEXT and FUNDING that will be given.
149
+ """
150
+
151
+ queries = [
152
+ "What is the company's product/service, and what are its key features?",
153
+ "Who is the target customer for the company's product/service, and what problem does it solve for them?",
154
+ "What are the company's revenue streams?",
155
+ "How does the company price its products/services?"
156
+ "What are the key cost drivers and profit margins for the company?",
157
+ "What opportunities for growth and expansion does the company foresee?",
158
+ "Who is the target market for the company's product/service, and how does the company plan to reach them?",
159
+ "What sales channels and distribution partnerships does the company have in place?",
160
+ "How is the company's marketing budget allocated?",
161
+ "What is the company's historical financial performance, including growth rate?",
162
+ "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?",
163
+ "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?",
164
+ "How does the company's product/service differentiate itself from competitors in the market?",
165
+ "What issue or challenge is the company addressing?",
166
+ ]
167
+
168
+ document_processing_event = asyncio.Event()
169
+ document_processing_event.set()
170
+
171
+
172
+ def get_digest(pdf_content):
173
+ h = hashlib.sha256()
174
+ h.update(pdf_content) # Hash the binary content of the PDF
175
+ return h.hexdigest()
176
+
177
+
178
+ def response(
179
+ message: object,
180
+ model: object = "meta-llama/llama-3-70b-instruct:nitro",
181
+ SysPrompt: object = SysPromptDefault,
182
+ temperature: object = 0.2,
183
+ ) -> object:
184
+ """
185
+ :rtype: object
186
+ """
187
+ client = openai.OpenAI(
188
+ api_key=OPENROUTER_API_KEY,
189
+ base_url="https://openrouter.ai/api/v1",
190
+ )
191
+
192
+ messages = [
193
+ {"role": "system", "content": SysPrompt},
194
+ {"role": "user", "content": message},
195
+ ]
196
+
197
+ @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
198
+ def completion_with_backoff(**kwargs):
199
+ print("RETRY")
200
+ return client.chat.completions.create(**kwargs)
201
+
202
+ try:
203
+ response = completion_with_backoff(
204
+ model=model,
205
+ messages=messages,
206
+ temperature=temperature,
207
+ frequency_penalty=0.2,
208
+ )
209
+ return response.choices[0].message.content
210
+ except Exception as e:
211
+ print(f"An error occurred: {e}")
212
+
213
+
214
+ def number_of_tokens(texts: List[str]) -> List[int]:
215
+ """
216
+ Calculate the number of tokens in a batch of strings.
217
+ """
218
+ model = tiktoken.encoding_for_model("gpt-3.5-turbo")
219
+ encodings = model.encode_batch(texts)
220
+ num_of_tokens = [len(encoding) for encoding in encodings]
221
+ return num_of_tokens
222
+
223
+
224
+ def limit_tokens(input_string, token_limit=5500):
225
+ """
226
+
227
+ Limit tokens sent to the model
228
+
229
+ """
230
+ encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
231
+ return encoding.decode(encoding.encode(input_string)[:token_limit])
232
+
233
+
234
+ def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
235
+ "Takes image path and extract information from it, and return it as text."
236
+
237
+ # Start Classifier inference session
238
+ classifier = Classifier("graph_classifierV2_B.onnx")
239
+ # Model for img to text
240
+ model = gemini.GenerativeModel("gemini-1.5-flash")
241
+
242
+ description_prompt = f"You are provided with the images extracted from a pitch-deck and some text surrounding the image from the same pitch deck. Extract all the factual information that the image is trying to communicate through line charts, area line charts, bar charts, pie charts, tables exectra. Use OCR to extract numerical figures and include them in the information. If the image does not have any information like its a blank image or image of a person then response should be NOTHING. Do not add any additional comments or markdown, just give information. \n\n SURROUNDING TEXT \n\n{text}"
243
+
244
+ img_list = []
245
+
246
+ for pixmap in pixmap_list:
247
+ try:
248
+ img_list.append(
249
+ Image.frombytes(
250
+ mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
251
+ )
252
+ )
253
+ except Exception as e:
254
+ print(e)
255
+
256
+ graph_image = classifier.classify(img_list)
257
+ print(graph_image)
258
+
259
+ response_list = []
260
+
261
+ for idx, is_graph in enumerate(graph_image):
262
+ if is_graph:
263
+ response = model.generate_content(
264
+ [description_prompt, img_list[idx]], stream=False
265
+ )
266
+ print("\n\n", response.text, "\n\n")
267
+ response_list.append(str(response.text))
268
+
269
+ return response_list
270
+
271
+
272
+ def extract_content(pdf_content: bytes) -> List[Tuple[str, int]]:
273
+ """
274
+ Takes PDF(bytes) and return a list of tuples containing text(including textual and image content)
275
+ and page number containing that text.
276
+ """
277
+ print("Extract content called ")
278
+ pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")
279
+
280
+ pages_content = []
281
+ refered_xref = []
282
+ for page_number in range(pdf_doc.page_count):
283
+ page_content = ""
284
+
285
+ # extracting text content
286
+ page = pdf_doc.load_page(page_number)
287
+ text_content = str(page.get_text()).replace("\n", "\t")
288
+ page_content += text_content
289
+
290
+ # extracting image content
291
+ image_list = page.get_image_info(xrefs=True)
292
+ pixmap_list = []
293
+ for img_info in image_list:
294
+ xref = img_info["xref"]
295
+ if xref not in refered_xref:
296
+ # if xref not in refered_xref:
297
+ try:
298
+ img_pixmap = pymupdf.Pixmap(pdf_doc, xref)
299
+ pixmap_list.append(img_pixmap)
300
+ refered_xref.append(xref)
301
+ except ValueError as e:
302
+ print(f"Skipping image with due to error: {e}")
303
+ if len(pixmap_list) > 0:
304
+ img_content = extract_image_content(
305
+ pixmap_list=pixmap_list, text=text_content.replace("\n", "\t")
306
+ )
307
+ page_content = page_content + "\n\n" + "\n\n".join(img_content)
308
+
309
+ pages_content.append(page_content)
310
+
311
+ num_tokens = number_of_tokens(pages_content)
312
+
313
+ final_data = []
314
+
315
+ # Logic to handle case when page content > 512 tokens
316
+ for e, n_token in enumerate(num_tokens):
317
+ if n_token > 500:
318
+ n_parts = numpy.ceil(n_token / 500).astype(int)
319
+ len_content = len(pages_content[e])
320
+ part_size = len_content // n_parts
321
+ start, end = 0, part_size
322
+ temp = []
323
+ for _ in range(n_parts):
324
+ temp.append((pages_content[e][start:end], e + 1))
325
+ start = end
326
+ end = end + part_size
327
+ final_data += temp
328
+ else:
329
+ final_data.append((pages_content[e], e + 1))
330
+
331
+ pdf_doc.close()
332
+ print(final_data)
333
+ return final_data
334
+
335
+
336
+ def markdown(output):
337
+ report_html = output.get("report", "")
338
+ references = output.get("references", {})
339
+ references_markdown = ""
340
+
341
+ for url, content in references.items():
342
+ # Making the URL clickable in pure HTML
343
+ clickable_url = f'<a href="{url}">{url}</a>'
344
+ references_markdown += f"<details><summary>{clickable_url}</summary>\n\n{html2text.html2text(content)}</details>\n\n"
345
+
346
+ combined_markdown = ""
347
+ if report_html.strip(): # Check if report_html is not empty
348
+ # Use html2text to convert HTML to Markdown, ensuring it doesn't break lines unnecessarily
349
+ report_markdown = html2text.html2text(report_html)
350
+ # Remove unwanted newlines within Markdown headings
351
+ report_markdown = report_markdown.replace('\n', ' ').replace(' ', '\n')
352
+ combined_markdown += report_markdown + "\n\n"
353
+ combined_markdown += references_markdown
354
+ return combined_markdown
355
+
356
+
357
+ def pinecone_server():
358
+ pc = Pinecone(api_key=PINECONE_API_KEY)
359
+ index_name = "investment"
360
+ if index_name not in pc.list_indexes().names():
361
+ pc.create_index(
362
+ index_name,
363
+ dimension=1024,
364
+ metric="cosine",
365
+ spec=ServerlessSpec(cloud="aws", region="us-east-1"),
366
+ )
367
+ time.sleep(1)
368
+ index = pc.Index(index_name)
369
+ index.describe_index_stats()
370
+ return index
371
+
372
+
373
+ def fetch_vectorstore_from_db(file_id):
374
+ conn = psycopg2.connect(
375
+ dbname="postgres",
376
+ user="postgres.kstfnkkxavowoutfytoq",
377
+ password="nI20th0in3@",
378
+ host="aws-0-us-east-1.pooler.supabase.com",
379
+ port="5432",
380
+ )
381
+ cur = conn.cursor()
382
+ create_table_query = """
383
+ CREATE TABLE IF NOT EXISTS investment_research_pro (
384
+ file_id VARCHAR(1024) PRIMARY KEY,
385
+ file_name VARCHAR(1024),
386
+ name_space VARCHAR(1024)
387
+
388
+ );
389
+ """
390
+ cur.execute(create_table_query)
391
+ conn.commit()
392
+ fetch_query = """
393
+ SELECT name_space
394
+ FROM investment_research_pro
395
+ WHERE file_id = %s;
396
+ """
397
+ cur.execute(fetch_query, (file_id,))
398
+ result = cur.fetchone()
399
+ cur.close()
400
+ conn.close()
401
+ if result:
402
+ return result[0]
403
+ return None
404
+
405
+
406
+ def get_next_namespace():
407
+ conn = psycopg2.connect(
408
+ dbname="postgres",
409
+ user="postgres.kstfnkkxavowoutfytoq",
410
+ password="nI20th0in3@",
411
+ host="aws-0-us-east-1.pooler.supabase.com",
412
+ port="5432",
413
+ )
414
+ cur = conn.cursor()
415
+ cur.execute("SELECT COUNT(*) FROM investment_research_pro")
416
+ count = cur.fetchone()[0]
417
+ next_namespace = f"pdf-{count + 1}"
418
+ cur.close()
419
+ conn.close()
420
+ return next_namespace
421
+
422
+
423
+ def insert_data(file_id, file_name, name_space):
424
+
425
+ print("inserted")
426
+ conn = psycopg2.connect(
427
+ dbname="postgres",
428
+ user="postgres.kstfnkkxavowoutfytoq",
429
+ password="nI20th0in3@",
430
+ host="aws-0-us-east-1.pooler.supabase.com",
431
+ port="5432",
432
+ )
433
+ cur = conn.cursor()
434
+ create_table_query = """
435
+ CREATE TABLE IF NOT EXISTS investment_research_pro (
436
+ file_id VARCHAR(1024) PRIMARY KEY,
437
+ file_name VARCHAR(1024),
438
+ name_space VARCHAR(255)
439
+ );
440
+ """
441
+ cur.execute(create_table_query)
442
+ conn.commit()
443
+ insert_query = """
444
+ INSERT INTO investment_research_pro (file_id, file_name, name_space)
445
+ VALUES (%s, %s, %s)
446
+ ON CONFLICT (file_id) DO NOTHING;
447
+ """
448
+ cur.execute(insert_query, (file_id, file_name, name_space))
449
+ conn.commit()
450
+ cur.close()
451
+ conn.close()
452
+
453
+
454
+ def create_documents(page_contents):
455
+ documents = []
456
+
457
+ for content, page_number in page_contents:
458
+ doc = {
459
+ "page_content": content,
460
+ "metadata": {"page_number": page_number, "original_content": content},
461
+ }
462
+ documents.append(doc)
463
+
464
+ return documents
465
+
466
+
467
+ def embed_and_upsert(documents, name_space):
468
+ chunks = [doc["page_content"] for doc in documents]
469
+ pinecone_index = pinecone_server()
470
+ embeddings_response = client.embeddings.create(
471
+ input=chunks, model="BAAI/bge-large-en-v1.5"
472
+ ).data
473
+ embeddings = [i.embedding for i in embeddings_response]
474
+ pinecone_data = []
475
+ for doc, embedding in zip(documents, embeddings):
476
+ i = str(uuid.uuid4())
477
+ pinecone_data.append(
478
+ {"id": i, "values": embedding, "metadata": doc["metadata"]}
479
+ )
480
+
481
+ pinecone_index.upsert(vectors=pinecone_data, namespace=name_space)
482
+
483
+
484
+ def embedding_creation(pdf_content, name_space):
485
+ data = extract_content(pdf_content)
486
+ # text_data = [i[0] for i in data]
487
+ documents = create_documents(data)
488
+ embed_and_upsert(documents, name_space)
489
+ print("Embeddings created and upserted successfully into Pinecone.")
490
+
491
+
492
+ def embed(question):
493
+ embeddings_response = client.embeddings.create(
494
+ input=[question],
495
+ model="BAAI/bge-large-en-v1.5",
496
+ ).data
497
+ embeddings = embeddings_response[0].embedding
498
+ return embeddings
499
+
500
+
501
+ def process_rerank_response(rerank_response, docs):
502
+ rerank_docs = []
503
+ for item in rerank_response.results:
504
+ index = item.index
505
+ if 0 <= index < len(docs):
506
+ rerank_docs.append(docs[index])
507
+ else:
508
+ print(f"Warning: Index {index} is out of range for documents list.")
509
+ return rerank_docs
510
+
511
+
512
+ async def get_docs(question, pdf_content, file_name):
513
+ global document_processing_event
514
+ index = pinecone_server()
515
+ co = cohere.Client(COHERE_API)
516
+ xq = embed(question)
517
+
518
+ await document_processing_event.wait()
519
+ file_id = get_digest(pdf_content)
520
+ existing_namespace = fetch_vectorstore_from_db(file_id)
521
+
522
+ if existing_namespace:
523
+ print("Document already exists. Using existing namespace.")
524
+ name_space = existing_namespace
525
+ else:
526
+ document_processing_event.clear()
527
+ print("evet stopped")
528
+ print("Document is new. Creating embeddings and new namespace.")
529
+ name_space = get_next_namespace()
530
+ print(name_space)
531
+ embedding_creation(pdf_content, name_space)
532
+ insert_data(file_id, file_name, name_space)
533
+ print("Sleep complete....")
534
+ # except Exception as e:
535
+ # print(e)
536
+ # finally:
537
+ print("finally called")
538
+ document_processing_event.set()
539
+
540
+ # Query is now inside the lock to ensure it happens after any new document processing
541
+ res = index.query(namespace=name_space, vector=xq, top_k=5, include_metadata=True)
542
+
543
+ print(res)
544
+ docs = [x["metadata"]["original_content"] for x in res["matches"]]
545
+
546
+ if not docs:
547
+ print("No matching documents found.")
548
+ return []
549
+
550
+ results = co.rerank(
551
+ query=question, documents=docs, top_n=3, model="rerank-english-v3.0"
552
+ )
553
+ reranked_docs = process_rerank_response(results, docs)
554
+ return reranked_docs
555
+
556
+
557
+ async def industry(pdf_content, file_name):
558
+ question = (
559
+ "What is the name and its specific niche business this document pertains to."
560
+ )
561
+ docs = await get_docs(question, pdf_content, file_name)
562
+ context = "\n\n".join(docs)
563
+ message = f"CONTEXT\n\n{context}\n\n"
564
+ model = "meta-llama/llama-3-70b-instruct:nitro"
565
+ response_str = response(
566
+ message=message, model=model, SysPrompt=IndustryPrompt, temperature=0
567
+ )
568
+ industry = json.loads(response_str)
569
+ print(industry)
570
+ return industry
571
+
572
+
573
+ def split_into_chunks(input_string, token_limit=4500):
574
+ # Initialize the tokenizer for the model
575
+ encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
576
+
577
+ # Encode the input string to get the tokens
578
+ tokens = encoding.encode(input_string)
579
+
580
+ # List to store chunks
581
+ chunks = []
582
+ start = 0
583
+
584
+ # Iterate over the tokens and split into chunks
585
+ while start < len(tokens):
586
+ end = start + token_limit
587
+ if end >= len(tokens):
588
+ chunk_tokens = tokens[start:]
589
+ else:
590
+ break_point = end
591
+ while break_point > start and tokens[break_point] not in encoding.encode(
592
+ " "
593
+ ):
594
+ break_point -= 1
595
+
596
+ if break_point == start:
597
+ chunk_tokens = tokens[start:end]
598
+ else:
599
+ chunk_tokens = tokens[start:break_point]
600
+ end = break_point
601
+
602
+ chunk = encoding.decode(chunk_tokens)
603
+ chunks.append(chunk)
604
+ start = end
605
+
606
+ return chunks
607
+
608
+
609
+ def further_split_chunk(chunk, token_limit):
610
+ encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
611
+ tokens = encoding.encode(chunk)
612
+ sub_chunks = []
613
+ start = 0
614
+
615
+ while start < len(tokens):
616
+ end = start + token_limit
617
+ if end >= len(tokens):
618
+ sub_chunk_tokens = tokens[start:]
619
+ else:
620
+ break_point = end
621
+ while break_point > start and tokens[break_point] not in encoding.encode(
622
+ " "
623
+ ):
624
+ break_point -= 1
625
+
626
+ if break_point == start:
627
+ sub_chunk_tokens = tokens[start:end]
628
+ else:
629
+ sub_chunk_tokens = tokens[start:break_point]
630
+ end = break_point
631
+
632
+ sub_chunk = encoding.decode(sub_chunk_tokens)
633
+ sub_chunks.append(sub_chunk)
634
+ start = end
635
+
636
+ return sub_chunks
637
+
638
+
639
+ # Define the investment function
640
+ def investment(queries, query_results, other_info_results,Funding):
641
+
642
+ # Combine queries and query_results into a dictionary
643
+ combined_results = {q: r for q, r in zip(queries[-4:], query_results[-4:])}
644
+
645
+ # Extract keys and answers from the other_info_results and update the combined_results dictionary
646
+ for key, value in other_info_results.items():
647
+ if isinstance(value, str): # Check if the value is a string
648
+ combined_results[key] = value.split("<details><summary>")[0].strip()
649
+ else:
650
+ combined_results[key] = value
651
+ print(combined_results)
652
+ message = f"CONTEXT:\n\n{json.dumps(combined_results, indent=4)}\n\nFUNDING:\n\n{Funding}\n\n"
653
+
654
+ sys_prompt = Investment
655
+ encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
656
+ sys_prompt_token_size = len(encoding.encode(sys_prompt))
657
+
658
+ max_model_tokens = 7000
659
+ max_chunk_size = 7000 # Adjust to leave more buffer space
660
+
661
+ chunks = split_into_chunks(message, token_limit=max_chunk_size)
662
+
663
+ model = "anthropic/claude-3.5-sonnet"
664
+ responses = []
665
+ tokens_used = 0
666
+ max_tokens_per_minute = 7000
667
+
668
+ for chunk in chunks:
669
+ chunk_token_size = len(encoding.encode(chunk))
670
+ combined_message = f"{sys_prompt}\n{chunk}"
671
+ combined_token_size = len(encoding.encode(combined_message))
672
+
673
+ print(
674
+ f"Token size of the combined message and SysPrompt for this chunk: {combined_token_size}"
675
+ )
676
+ print(f"Chunk token size: {chunk_token_size}")
677
+ print(f"SysPrompt token size: {sys_prompt_token_size}")
678
+
679
+ if combined_token_size > max_model_tokens:
680
+ print(
681
+ f"Warning: Combined token size ({combined_token_size}) exceeds the model's limit ({max_model_tokens}). Adjusting chunk size."
682
+ )
683
+ sub_chunks = further_split_chunk(
684
+ chunk, max_model_tokens - sys_prompt_token_size
685
+ )
686
+ for sub_chunk in sub_chunks:
687
+ sub_chunk_token_size = len(encoding.encode(sub_chunk))
688
+ print(sub_chunk_token_size)
689
+ if sub_chunk_token_size > 500:
690
+ sub_combined_message = f"{sys_prompt}\n{sub_chunk}"
691
+ sub_combined_token_size = len(encoding.encode(sub_combined_message))
692
+ if sub_combined_token_size <= max_model_tokens:
693
+ response_str = response(
694
+ message=sub_chunk,
695
+ model=model,
696
+ SysPrompt=sys_prompt,
697
+ temperature=0,
698
+ )
699
+ print(response_str)
700
+ json_part = extract_json(response_str)
701
+ if json_part:
702
+ responses.append(json_part)
703
+ else:
704
+ print("Warning: No valid JSON part found in the response.")
705
+ tokens_used += sub_combined_token_size
706
+ if tokens_used >= max_tokens_per_minute:
707
+ print("Waiting for 60 seconds to avoid rate limit.")
708
+ time.sleep(60)
709
+ tokens_used = 0
710
+ else:
711
+ if chunk_token_size >= 500:
712
+ response_str = response(
713
+ message=chunk, model=model, SysPrompt=sys_prompt, temperature=0
714
+ )
715
+ print(response_str)
716
+ json_part = extract_json(response_str)
717
+ if json_part:
718
+ responses.append(json_part)
719
+ else:
720
+ print("Warning: No valid JSON part found in the response.")
721
+ tokens_used += combined_token_size
722
+ if tokens_used >= max_tokens_per_minute:
723
+ print("Waiting for 60 seconds to avoid rate limit.")
724
+ time.sleep(60)
725
+ tokens_used = 0
726
+
727
+ combined_json = {"sectors": [], "final_score": 0}
728
+ total_score = 0
729
+ count = 0
730
+
731
+ for response_str in responses:
732
+ response_json = json.loads(response_str)
733
+ combined_json["sectors"].append(response_json)
734
+ total_score += response_json["overall_score"]
735
+ count += 1
736
+
737
+ if count > 0:
738
+ combined_json["final_score"] = total_score / count
739
+ final_json = json.dumps(combined_json, indent=4)
740
+ print(final_json)
741
+ return final_json
742
+
743
+
744
def extract_json(response_str):
    """Pull the outermost JSON object out of *response_str*.

    Returns the matched JSON substring, or None when no brace-delimited
    span exists or the span is not valid JSON.
    """
    found = re.search(r"\{.*}", response_str, re.DOTALL)
    if found is None:
        return None
    candidate = found.group()
    try:
        json.loads(candidate)  # validate only; caller keeps the raw string
    except json.JSONDecodeError:
        print("Invalid JSON detected.")
        return None
    return candidate
755
+
756
+
757
async def answer(client, question, pdf_content, file_name):
    """Route *question* through a router model, then generate an answer.

    Retrieves context docs for the question, asks a router model whether the
    context is sufficient ("vectorstore") or not ("missing_information"), and
    generates the final completion with the matching system prompt.

    Args:
        client: async OpenAI-compatible client (OpenRouter).
        question: user question string.
        pdf_content: raw PDF bytes of the pitch deck.
        file_name: original file name of the deck.

    Returns:
        The chat-completion response object from the generation call.

    Raises:
        ValueError: if the router returns an unrecognized datasource.
    """
    docs = await get_docs(question, pdf_content, file_name)
    context = "\n\n".join(docs)
    model = "meta-llama/llama-3-70b-instruct:nitro"

    # Ask the router which generation path to take.
    router_user_msg = f"CONTEXT:\n\n{context}\n\nQUESTION :\n\n{question}\n\n"
    router_response = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": QuestionRouter},
            {"role": "user", "content": router_user_msg},
        ],
        model=model,
        temperature=0,
    )
    print(router_response)
    source = json.loads(router_response.choices[0].message.content)
    print(source)

    datasource = source["datasource"].lower()
    if datasource == "vectorstore":
        print("---ROUTE QUESTION TO RAG---")
        sys_prompt = GenerationPrompt
    elif datasource == "missing_information":
        print("---NO SUFFICIENT INFORMATION---")
        sys_prompt = MissingInformation
    else:
        # BUG FIX: the original fell through with `output` unbound on any
        # unexpected datasource and crashed with UnboundLocalError at the
        # return; fail explicitly with a diagnosable error instead.
        raise ValueError(
            f"Unexpected datasource from router: {source['datasource']!r}"
        )

    # Both branches used an identical generation call; issue it once.
    gen_user_msg = f"CONTEXT:\n\n{context}\n\nQUESTION:\n\n{question}\n\nANSWER:\n"
    output = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": gen_user_msg},
        ],
        model=model,
        temperature=0,
    )
    return output
801
+
802
+
803
async def process_queries(queries, pdf_content, file_name):
    """Answer every query concurrently against the same pitch-deck content.

    Opens one async OpenRouter client for the batch and returns the message
    content of each response, in the same order as *queries*.
    """
    async_client = openai.AsyncClient(
        api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1"
    )
    async with async_client as aclient:
        # gather() wraps each coroutine in a task, so they run concurrently.
        responses = await asyncio.gather(
            *(answer(aclient, query, pdf_content, file_name) for query in queries)
        )

    return [reply.choices[0].message.content for reply in responses]
817
+
818
+
819
async def web_search(session, question):
    """POST *question* to the report-generation service and return its JSON body.

    Raises ValueError when the service responds with a non-JSON content type.
    """
    payload = {
        "topic": "",
        "description": question,
        "user_id": "",
        "user_name": "",
        "internet": True,
        "output_format": "report_table",
        "data_format": "No presets",
    }
    url = "https://pvanand-search-generate-staging.hf.space/generate_report"
    async with session.post(
        url,
        json=payload,
        headers={"Content-Type": "application/json"},
    ) as response:
        print(f"Status: {response.status}")
        print(f"Headers: {response.headers}")
        content = await response.text()
        print(f"Content: {content[:200]}...")  # Print first 200 chars of content
        # Guard clause: only decode bodies the server declares as JSON.
        if not response.headers.get('Content-Type', '').startswith('application/json'):
            raise ValueError(f"Unexpected content type: {response.headers.get('Content-Type')}")
        return await response.json()
842
+
843
+
844
async def other_info(pdf_content, file_name):
    """Run four market-research web searches based on the deck's industry/niche.

    Returns a dict keyed by category name; each value is the rendered search
    report, or ``{"error": ...}`` when that search failed.
    """
    data = await industry(pdf_content, file_name)
    industry_company = data.get("industry")
    niche = data.get("niche")

    # One question per research category.
    questions = {
        "Risk Involved": f"What are risk involved in the starting a {niche} business in {industry_company}?",
        "Barrier To Entry": f"What are barrier to entry for a {niche} business in {industry_company}?",
        "Competitors": f"Who are the main competitors in the market for {niche} business in {industry_company}?",
        "Challenges": f"What are in the challenges in the {niche} business for {industry_company}?",
    }

    # Fire all searches concurrently; exceptions are returned, not raised.
    async with aiohttp.ClientSession() as session:
        outcomes = await asyncio.gather(
            *(web_search(session, question) for question in questions.values()),
            return_exceptions=True,
        )

    results = {}
    for category, outcome in zip(questions, outcomes):
        if isinstance(outcome, Exception):
            results[category] = {"error": str(outcome)}
        else:
            results[category] = markdown(outcome)

    return results
870
+
871
@cache(expire=604800)
async def upload_pitchdeck(pdf_content: bytes, file_name: str, Funding: float):
    """Full pitch-deck analysis pipeline; results cached for one week (604800 s).

    Args:
        pdf_content: raw PDF bytes of the pitch deck.
        file_name: original file name (used downstream for doc retrieval).
        Funding: requested funding amount, forwarded to the grading step.

    Returns:
        Dict with the canned queries, their answers, the market-research
        results, and the parsed grading JSON.
    """
    # PERF: the query answering and the market research are independent of
    # each other; the original awaited them sequentially — overlap them.
    query_results, other_info_results = await asyncio.gather(
        process_queries(queries, pdf_content, file_name),
        other_info(pdf_content, file_name),
    )
    grading_results = json.loads(
        investment(queries, query_results, other_info_results, Funding)
    )

    return {
        "queries": queries,
        "query_results": query_results,
        "other_info_results": other_info_results,
        "grading_results": grading_results,
    }
885
+
886
+
887
+
888
@app.post("/investor")
async def process_pitchdeck(file: UploadFile = File(...), Funding: float = Form(...)):
    """Accept an uploaded pitch-deck PDF and a funding amount, run the analysis."""
    if not file:
        raise HTTPException(status_code=400, detail="PDF file not provided")

    content = await file.read()
    return await upload_pitchdeck(content, file.filename, Funding)
897
+
898
# Open CORS policy: accept requests from any origin, with any method/header.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec (credentialed responses require an
# explicit origin, not a wildcard) — confirm whether credentialed requests
# are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
905
+
906
+
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.4.0
3
+ cachetools==5.3.3
4
+ certifi==2024.2.2
5
+ charset-normalizer==3.3.2
6
+ distro==1.9.0
7
+ exceptiongroup==1.2.1
8
+ google-ai-generativelanguage==0.6.4
9
+ google-api-core==2.19.0
10
+ google-api-python-client==2.131.0
11
+ google-auth==2.29.0
12
+ google-auth-httplib2==0.2.0
13
+ google-generativeai==0.5.4
14
+ googleapis-common-protos==1.63.0
15
+ grpcio==1.64.0
16
+ grpcio-status==1.62.2
17
+ h11==0.14.0
18
+ httpcore==1.0.5
19
+ httplib2==0.22.0
20
+ httpx==0.27.0
21
+ idna==3.7
22
+ numpy==1.26.4
23
+ openai==1.30.5
24
+ pandas==2.2.2
25
+ Pillow==9.5.0
26
+ proto-plus==1.23.0
27
+ protobuf==4.25.3
28
+ pyasn1==0.6.0
29
+ pyasn1_modules==0.4.0
30
+ pydantic==2.7.2
31
+ pydantic_core==2.18.3
32
+ PyMuPDF==1.24.5
33
+ PyMuPDFb==1.24.3
34
+ pyparsing==3.1.2
35
+ python-dateutil==2.9.0.post0
36
+ python-dotenv==1.0.1
37
+ pytz==2024.1
38
+ requests==2.32.3
39
+ rsa==4.9
40
+ six==1.16.0
41
+ sniffio==1.3.1
42
+ tqdm==4.66.4
43
+ typing_extensions==4.12.0
44
+ tzdata==2024.1
45
+ uritemplate==4.1.1
46
+ urllib3==2.2.1
47
+ psycopg2-binary==2.9.9
48
+ pinecone-client==4.1.0
49
+ cohere==5.5.4
50
+ tiktoken==0.7.0
51
+ html2text==2024.2.26
52
+ mistune==3.0.2
53
+ tenacity==8.3.0
54
+ streamlit==1.35.0
55
+ pdfkit==1.0.0
56
+ Markdown==3.6
57
+ xhtml2pdf==0.2.16
58
+ reportlab==4.2.0
59
+ beautifulsoup4==4.12.3
60
+ fastapi==0.111.0
61
+ uvicorn==0.29.0
62
+ onnxruntime==1.18.0
63
+ aiohttp==3.9.5
64
+ fastapi-cache2