ashishbangwal committed on
Commit
206ef5f
·
1 Parent(s): 9b50757

latest updates to investor agent [UI + backend-logic change]

Browse files
main.py CHANGED
@@ -1,910 +1,84 @@
1
- import os
2
- import uuid
3
- import psycopg2
4
- import time
5
- import re
6
- import asyncio
7
- import cohere
8
- import numpy
9
- import streamlit as st
10
- import pdfkit
11
- import json
12
- import requests
13
- import tempfile
14
- import mistune
15
- import markdown as md
16
- import psycopg2
17
- import html2text
18
- from typing import List, Tuple, Dict
19
- from pinecone import Pinecone, ServerlessSpec
20
- import openai
21
- import os
22
- import pymupdf
23
- import tiktoken
24
- import google.generativeai as gemini
25
- from PIL import Image
26
- from PIL import PngImagePlugin # important to avoid google_genai AttributeError
27
- import json
28
- import hashlib
29
- from dotenv import load_dotenv
30
- from classifier import Classifier
31
- from tenacity import retry, stop_after_attempt, wait_random_exponential
32
- from fastapi import FastAPI, HTTPException, UploadFile, File, Form
33
- from fastapi.middleware.cors import CORSMiddleware
34
- from fastapi_cache import FastAPICache
35
- from fastapi_cache.backends.inmemory import InMemoryBackend
36
- from fastapi_cache.decorator import cache
37
- import aiohttp
38
-
39
# FastAPI application instance; all routes in this module attach to it.
app = FastAPI()


# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favour of lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
async def startup():
    # Initialise the in-process cache backend used by fastapi-cache decorators.
    FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")
45
-
46
-
47
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()

# API credentials read from the environment; any of these is None when the
# corresponding variable is unset.
TOGETHER_API_KEY = os.getenv("TOGETHER_API")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
COHERE_API = os.getenv("COHERE_API")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Configure the Gemini SDK (used for image-to-text in extract_image_content).
gemini.configure(api_key=GEMINI_API_KEY)

# OpenAI-compatible client pointed at Together AI; used for embeddings below.
client = openai.OpenAI(base_url="https://api.together.xyz/v1", api_key=TOGETHER_API_KEY)
58
-
59
# --- LLM system prompts ----------------------------------------------------
# These strings are sent verbatim to the models; their wording (including
# existing typos) is deliberately left untouched, since any change alters
# model input and therefore behaviour.

# Fallback system prompt used when a caller supplies none.
SysPromptDefault = "You are now in the role of an expert AI."
# RAG answer-generation prompt (the "vectorstore" route in answer()).
GenerationPrompt = """You are an expert AI whose task is to ANSWER the user's QUESTION using the provided CONTEXT.
Forget everything you know, Fully rely on CONTEXT to provide the answer.
Follow these steps:
1. Think deeply and multiple times about the user's QUESTION. You must understand the intent of their question and provide the most appropriate answer.
2. Choose the most relevant content from the CONTEXT that addresses the user's question and use it to generate an answer.
Formating Instructions:
Respond only in markdown format; don't use big headings"""
# Router prompt: decides whether the CONTEXT can answer the QUESTION
# ("vectorstore") or not ("missing_information"); must return one-key JSON.
QuestionRouter = """ You are an expert investor, You must identify if the provided CONTEXT can answer the user QUESTION.
1 vectorstore : The provided CONTEXT is sufficient to answer the question.
2 missing_information : The provided CONTEXT does not contains the answer.
output options: 'vectorstore' OR 'missing_information'.The output must be a valid JSON.Do not add any additional comments.
Output format:
{
"datasource":"identified option"
}
Return the a valid JSON with a single key 'datasource' and no preamble or explanation. Question to identify: QUESTION """
# Prompt used when the router decides the context lacks the answer.
MissingInformation = """You are an expert in identifying missing information in a given CONTEXT to answer a QUESTION effectively. Your task is to analyze the CONTEXT, and pinpoint the missing content needed for each QUESTION. Take your time to process the information thoroughly and provide a list output without any additional comments. The output format should be valid markdown list , without any additional comments:
"""
# Page-by-page PDF summarisation prompt (keeps numeric data verbatim).
SummaryPrompt = """You are an expert AI specializing in document summarization. You have been refining your skills in text summarization and data extraction for over a decade, ensuring the highest accuracy and clarity. Your task is to read raw data from a PDF file page by page and provide a detailed summary of the CONTEXT while ensuring all numerical data is included in the summary without alteration. The output should be in Markdown format, with appropriate headers and lists to enhance readability. Follow these instructions:
1.Summarize the Text: Provide a concise summary of the CONTEXT, capturing the main points and essential information.
2.Retain Numerical Data: Ensure all numerical data (e.g., dates, statistics, financial figures, percentages, measurements) is included in the summary.
3.Markdown Format: Format the output in Markdown, using headers, lists, and other Markdown elements appropriately.
Note: Whenever the CONTEXT is about a TEAM, DO NOT summarize; instead, output the content in a neat markdown format with Names and previous designation of the TEAM.
"""
# Industry/niche classification prompt; expects a three-key JSON response
# ("industry", "niche", "reasoning") — consumed by industry().
IndustryPrompt = """You are a business strategy consultant. You have been identifying niche markets and industries for companies across various sectors for over 20 years. Your expertise lies in analyzing detailed CONTEXT to accurately pinpoint the niche and industry of a business.
Objective: Identify the niche and industry of a business by analyzing the provided CONTEXT.
Steps to follow:
Read the context: Carefully read the provided information to understand the business's products, services, target audience, and unique value propositions.
Determine the industry: Based on the provided CONTEXT, identify the primary industry to which the business belongs. Consider factors such as the type of products/services offered, the market served, and industry-specific terminology.
Identify the niche: Analyze the details to pinpoint the specific niche within the industry. Look for unique aspects of the business, specialized market segments, or specific customer needs that the business addresses.
Provide output in JSON format: Clearly state the identified industry and niche in a JSON format. Ensure your reasoning supports the identified industry and niche.The output should JSON ,Do not add any additional format.
Output format:
{
"industry": "Identified industry here",
"niche": "Identified niche here",
"reasoning": "Explanation of how the industry and niche were identified based on the context"
}
Take a deep breath and work on this problem step-by-step.
"""

# Scoring rubric for investment().  NOTE: the schema below declares
# "overall_score" as a string — the consumer must coerce it to a number.
Investment = """You are a professional financial analyst evaluating sectors for investment potential. Your task is to:

1. Identify the sector from the provided CONTEXT.
2. Grade only the specified KEYS on a scale of 1-10, with higher grades indicating better investment potential. Take a conservative approach in grading.
3. Provide reasoning for each grade considering both qualitative and quantitative factors, including the FUNDING information.
4. Assign weights to each section (total should equal 1).
5. Calculate an overall weighted score.

Use only the information given in the CONTEXT and the FUNDING provided. Be conservative in your grading to reflect investment risks. Output your analysis in the following JSON format:

```json
{
"sector": "Sector name",
"sections": [
{
"section": "Key from Context",
"score": "Grade (1-10)",
"weight": "Weight (0-1)",
"reasoning": "Detailed analysis including funding considerations"
}
],
"overall_score": "Calculated weighted score"
}
```

Grade only these KEYS from the CONTEXT:
1. "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?"
2. "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?"
3. "How does the company's product/service differentiate itself from competitors in the market?"
4. "What issue or challenge is the company addressing?"
5. "Risks Involved"
6. "Barrier To Entry"
7. "Competitors"
8. "Challenges"

Additionally, consider the FUNDING provided by the user:
- The FUNDING will be classified as follows:
* Low: Less than $1 million
* Medium: $1 million to $10 million
* High: More than $10 million
- Adjust your scoring based on the funding classification:
* For Low funding: Reduce scores by 1-2 points where relevant.
* For Medium funding: Keep scores as they would be without considering funding.
* For High funding: Increase scores by 1-2 points where relevant.
- Incorporate the funding information into your analysis of each relevant key.
- Consider how the funding level impacts various aspects such as projected financials, ability to execute plans, competitive positioning, and risk mitigation.
- Reflect the impact of funding in your scoring and reasoning for each relevant key.
-Don't explicitly mention the original funding in the answer but use them to give reasoning.


Provide your analysis based on the CONTEXT and FUNDING that will be given.
"""
152
-
153
# The standard due-diligence questions asked of every pitch deck; the last
# four entries double as the graded KEYS consumed by investment().
queries = [
    "What is the company's product/service, and what are its key features?",
    "Who is the target customer for the company's product/service, and what problem does it solve for them?",
    "What are the company's revenue streams?",
    # BUGFIX: a missing trailing comma previously merged this question with
    # the next one via implicit string concatenation, silently dropping one
    # query from the list (13 items instead of 14).
    "How does the company price its products/services?",
    "What are the key cost drivers and profit margins for the company?",
    "What opportunities for growth and expansion does the company foresee?",
    "Who is the target market for the company's product/service, and how does the company plan to reach them?",
    "What sales channels and distribution partnerships does the company have in place?",
    "How is the company's marketing budget allocated?",
    "What is the company's historical financial performance, including growth rate?",
    "What are the company's projected revenue, expenses, and profits for the future and cash flow projections?",
    "What is the founder's experience and track record, along with the key team members' bios, background checks, and their roles and responsibilities?",
    "How does the company's product/service differentiate itself from competitors in the market?",
    "What issue or challenge is the company addressing?",
]
169
-
170
# Gate serialising PDF ingestion: cleared while a new document is being
# embedded so concurrent queries wait for indexing to finish (see get_docs()).
# Starts "set" (open) so the first caller proceeds immediately.
document_processing_event = asyncio.Event()
document_processing_event.set()
172
-
173
-
174
def get_digest(pdf_content):
    """Return the SHA-256 hex digest of the raw PDF bytes (used as file_id)."""
    return hashlib.sha256(pdf_content).hexdigest()
178
-
179
-
180
def response(
    message: str,
    model: str = "meta-llama/llama-3-70b-instruct:nitro",
    SysPrompt: str = SysPromptDefault,
    temperature: float = 0.2,
):
    """
    Send a single system+user chat-completion request through OpenRouter.

    Retries with exponential backoff (up to 6 attempts) on transient
    failures.  Returns the completion text, or None if every attempt failed.

    :param message: user-turn content.
    :param model: OpenRouter model identifier.
    :param SysPrompt: system-turn content.
    :param temperature: sampling temperature.
    """
    # A fresh client per call keeps this function self-contained; the
    # module-level `client` points at Together AI, not OpenRouter.
    client = openai.OpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url="https://openrouter.ai/api/v1",
    )

    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]

    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        print("RETRY")
        return client.chat.completions.create(**kwargs)

    try:
        completion = completion_with_backoff(
            model=model,
            messages=messages,
            temperature=temperature,
            frequency_penalty=0.2,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # BUGFIX: the failure path previously fell off the end of the
        # function, returning None implicitly; make that contract explicit.
        print(f"An error occurred: {e}")
        return None
214
-
215
-
216
def number_of_tokens(texts: List[str]) -> List[int]:
    """Return the gpt-3.5-turbo token count for each string in *texts*."""
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return [len(encoded) for encoded in encoder.encode_batch(texts)]
224
-
225
-
226
def limit_tokens(input_string, token_limit=5500):
    """Truncate *input_string* to at most *token_limit* gpt-3.5-turbo tokens."""
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    truncated = encoding.encode(input_string)[:token_limit]
    return encoding.decode(truncated)
234
-
235
-
236
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """
    Describe chart-like images from a pitch deck as text.

    Each pixmap is converted to a PIL image, a local ONNX classifier decides
    which images are graphs/charts, and Gemini extracts factual content from
    those.  Returns one description string per graph-classified image.
    """
    # ONNX classifier session + Gemini vision model, created per call.
    classifier = Classifier("graph_classifierV2_B.onnx")
    model = gemini.GenerativeModel("gemini-1.5-flash")

    description_prompt = f"You are provided with the images extracted from a pitch-deck and some text surrounding the image from the same pitch deck. Extract all the factual information that the image is trying to communicate through line charts, area line charts, bar charts, pie charts, tables exectra. Use OCR to extract numerical figures and include them in the information. If the image does not have any information like its a blank image or image of a person then response should be NOTHING. Do not add any additional comments or markdown, just give information. \n\n SURROUNDING TEXT \n\n{text}"

    pil_images = []
    for pixmap in pixmap_list:
        try:
            converted = Image.frombytes(
                mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
            )
            pil_images.append(converted)
        except Exception as e:
            print(e)

    # One boolean per image: True when the classifier thinks it is a chart.
    graph_flags = classifier.classify(pil_images)
    print(graph_flags)

    descriptions = []
    for image, is_graph in zip(pil_images, graph_flags):
        if not is_graph:
            continue
        reply = model.generate_content([description_prompt, image], stream=False)
        print("\n\n", reply.text, "\n\n")
        descriptions.append(str(reply.text))

    return descriptions
272
-
273
-
274
def extract_content(pdf_content: bytes) -> List[Tuple[str, int]]:
    """
    Extract text + image descriptions from a PDF.

    Returns a list of (content, page_number) tuples, 1-based page numbers.
    Pages whose content exceeds ~500 tokens are split into several roughly
    equal character slices so each chunk fits the embedding model's input.
    """
    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")

    pages_content = []
    refered_xref = []  # image xrefs already described, so shared images run once
    for page_number in range(pdf_doc.page_count):
        page_content = ""

        # Textual content; newlines flattened to tabs to keep one line per page.
        page = pdf_doc.load_page(page_number)
        text_content = str(page.get_text()).replace("\n", "\t")
        page_content += text_content

        # Image content: collect new pixmaps, then describe them via Gemini.
        image_list = page.get_image_info(xrefs=True)
        pixmap_list = []
        for img_info in image_list:
            xref = img_info["xref"]
            if xref not in refered_xref:
                try:
                    img_pixmap = pymupdf.Pixmap(pdf_doc, xref)
                    pixmap_list.append(img_pixmap)
                    refered_xref.append(xref)
                except ValueError as e:
                    print(f"Skipping image with due to error: {e}")
        if len(pixmap_list) > 0:
            img_content = extract_image_content(
                pixmap_list=pixmap_list, text=text_content.replace("\n", "\t")
            )
            page_content = page_content + "\n\n" + "\n\n".join(img_content)

        pages_content.append(page_content)

    num_tokens = number_of_tokens(pages_content)

    final_data = []

    # Split any page whose content exceeds ~512 tokens into n roughly equal parts.
    for e, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = numpy.ceil(n_token / 500).astype(int)
            len_content = len(pages_content[e])
            part_size = len_content // n_parts
            start = 0
            temp = []
            for part in range(n_parts):
                # BUGFIX: the final slice now runs to the end of the page, so
                # the trailing remainder (up to n_parts-1 characters that
                # integer division dropped) is no longer lost.
                end = len_content if part == n_parts - 1 else start + part_size
                temp.append((pages_content[e][start:end], e + 1))
                start = end
            final_data += temp
        else:
            final_data.append((pages_content[e], e + 1))

    pdf_doc.close()
    print(final_data)
    return final_data
336
-
337
-
338
def markdown(output):
    """
    Convert a web_search() result ({"report": html, "references": {url: html}})
    into a single Markdown string with collapsible reference sections.
    """
    report_html = output.get("report", "")
    references = output.get("references", {})
    references_markdown = ""

    for url, content in references.items():
        # Making the URL clickable in pure HTML
        clickable_url = f'<a href="{url}">{url}</a>'
        references_markdown += f"<details><summary>{clickable_url}</summary>\n\n{html2text.html2text(content)}</details>\n\n"

    combined_markdown = ""
    if report_html.strip():  # Check if report_html is not empty
        # Use html2text to convert HTML to Markdown, ensuring it doesn't break lines unnecessarily
        report_markdown = html2text.html2text(report_html)
        # Remove unwanted newlines within Markdown headings
        # NOTE(review): as written this turns EVERY space into a newline
        # (first replace makes "\n"->" ", second " "->"\n"); presumably the
        # second argument was meant to be a double space — confirm against
        # the intended rendering before relying on this output.
        report_markdown = report_markdown.replace("\n", " ").replace(" ", "\n")
        combined_markdown += report_markdown + "\n\n"
    combined_markdown += references_markdown
    return combined_markdown
357
-
358
-
359
def pinecone_server():
    """
    Return a handle to the "investment" Pinecone index, creating it on first use.

    Dimension 1024 matches the BAAI/bge-large-en-v1.5 embeddings produced by
    embed()/embed_and_upsert().
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index_name = "investment"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            index_name,
            dimension=1024,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    # Brief pause so a freshly created index is ready before first use.
    time.sleep(1)
    index = pc.Index(index_name)
    index.describe_index_stats()
    return index
373
-
374
-
375
def fetch_vectorstore_from_db(file_id):
    """
    Look up the Pinecone namespace previously recorded for *file_id*.

    Creates the tracking table on first use.  Returns the namespace string,
    or None when the file has never been ingested.
    """
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            create_table_query = """
            CREATE TABLE IF NOT EXISTS investment_research_pro (
                file_id VARCHAR(1024) PRIMARY KEY,
                file_name VARCHAR(1024),
                name_space VARCHAR(1024)
            );
            """
            cur.execute(create_table_query)
            conn.commit()
            fetch_query = """
            SELECT name_space
            FROM investment_research_pro
            WHERE file_id = %s;
            """
            cur.execute(fetch_query, (file_id,))
            result = cur.fetchone()
    finally:
        # BUGFIX: the connection now closes even when a query raises;
        # previously an exception leaked both cursor and connection.
        conn.close()
    if result:
        return result[0]
    return None
406
-
407
-
408
def get_next_namespace():
    """
    Generate the next Pinecone namespace name as "pdf-<row count + 1>".

    NOTE(review): deriving the name from the row count can collide if rows
    are ever deleted or two documents are ingested concurrently — confirm
    this is acceptable, or switch to a sequence/UUID.
    """
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM investment_research_pro")
            count = cur.fetchone()[0]
    finally:
        # BUGFIX: the connection now closes even when the query raises.
        conn.close()
    return f"pdf-{count + 1}"
423
-
424
-
425
def insert_data(file_id, file_name, name_space):
    """
    Record the (file_id, file_name, name_space) mapping for an ingested PDF.

    Creates the tracking table on first use.  Duplicate file_ids are ignored
    (ON CONFLICT DO NOTHING), so re-inserting a known file is a no-op.
    """
    print("inserted")
    # FIXME(security): database credentials are hard-coded in source —
    # move them to environment variables and rotate the exposed password.
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres.vjbkvfmqsaebxlnvvjtm",
        password="FPaN3iV1fuWteBON",
        host="aws-0-ap-south-1.pooler.supabase.com",
        port="6543",
    )
    try:
        with conn.cursor() as cur:
            create_table_query = """
            CREATE TABLE IF NOT EXISTS investment_research_pro (
                file_id VARCHAR(1024) PRIMARY KEY,
                file_name VARCHAR(1024),
                name_space VARCHAR(255)
            );
            """
            cur.execute(create_table_query)
            conn.commit()
            insert_query = """
            INSERT INTO investment_research_pro (file_id, file_name, name_space)
            VALUES (%s, %s, %s)
            ON CONFLICT (file_id) DO NOTHING;
            """
            cur.execute(insert_query, (file_id, file_name, name_space))
            conn.commit()
    finally:
        # BUGFIX: the connection now closes even when a query raises.
        conn.close()
454
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
def create_documents(page_contents):
    """
    Wrap (content, page_number) tuples into document dicts for upserting.

    Each dict carries the text under "page_content" plus a metadata mapping
    with the page number and the original text (stored so retrieval can
    return the untruncated content).
    """
    return [
        {
            "page_content": content,
            "metadata": {"page_number": page_number, "original_content": content},
        }
        for content, page_number in page_contents
    ]
467
-
468
-
469
def embed_and_upsert(documents, name_space):
    """
    Embed each document's text with BAAI/bge-large-en-v1.5 (via the Together
    client) and upsert the vectors + metadata into the Pinecone namespace.
    """
    texts = [doc["page_content"] for doc in documents]
    pinecone_index = pinecone_server()
    embedding_items = client.embeddings.create(
        input=texts, model="BAAI/bge-large-en-v1.5"
    ).data
    # Random UUIDs as vector ids; metadata carries page number + original text.
    vectors = [
        {"id": str(uuid.uuid4()), "values": item.embedding, "metadata": doc["metadata"]}
        for doc, item in zip(documents, embedding_items)
    ]
    pinecone_index.upsert(vectors=vectors, namespace=name_space)
484
-
485
-
486
def embedding_creation(pdf_content, name_space):
    """End-to-end ingestion: extract PDF content, wrap as documents, embed + upsert."""
    documents = create_documents(extract_content(pdf_content))
    embed_and_upsert(documents, name_space)
    print("Embeddings created and upserted successfully into Pinecone.")
492
-
493
-
494
def embed(question):
    """Return the embedding vector for one query string (BAAI/bge-large-en-v1.5)."""
    items = client.embeddings.create(
        input=[question],
        model="BAAI/bge-large-en-v1.5",
    ).data
    return items[0].embedding
501
-
502
-
503
def process_rerank_response(rerank_response, docs):
    """
    Map Cohere rerank result indices back onto the original document list.

    Out-of-range indices are reported and skipped; order follows the rerank
    results (best first).
    """
    selected = []
    for result in rerank_response.results:
        index = result.index
        if not (0 <= index < len(docs)):
            print(f"Warning: Index {index} is out of range for documents list.")
            continue
        selected.append(docs[index])
    return selected
512
-
513
-
514
async def get_docs(question, pdf_content, file_name):
    """
    Retrieve the top reranked document chunks for *question* from Pinecone.

    If the PDF has not been ingested before, it is embedded into a fresh
    namespace first; `document_processing_event` gates concurrent callers
    while that happens.  Returns up to 3 reranked chunk strings (may be
    empty when nothing matches).
    """
    global document_processing_event
    index = pinecone_server()
    co = cohere.Client(COHERE_API)
    xq = embed(question)

    # Wait for any in-flight ingestion to finish before looking up the file.
    await document_processing_event.wait()
    file_id = get_digest(pdf_content)
    existing_namespace = fetch_vectorstore_from_db(file_id)

    if existing_namespace:
        print("Document already exists. Using existing namespace.")
        name_space = existing_namespace
    else:
        # Block concurrent queries while the new document is indexed.
        document_processing_event.clear()
        print("Document is new. Creating embeddings and new namespace.")
        try:
            name_space = get_next_namespace()
            print(name_space)
            embedding_creation(pdf_content, name_space)
            insert_data(file_id, file_name, name_space)
        finally:
            # BUGFIX: always release the gate, even if ingestion fails —
            # previously an exception here left every waiter blocked forever
            # (the original try/finally was commented out).
            document_processing_event.set()

    # Query happens after any new-document processing has completed.
    res = index.query(namespace=name_space, vector=xq, top_k=5, include_metadata=True)

    print(res)
    docs = [x["metadata"]["original_content"] for x in res["matches"]]

    if not docs:
        print("No matching documents found.")
        return []

    results = co.rerank(
        query=question, documents=docs, top_n=3, model="rerank-english-v3.0"
    )
    return process_rerank_response(results, docs)
557
-
558
-
559
async def industry(pdf_content, file_name):
    """
    Identify the business's industry and niche from the pitch deck.

    Retrieves relevant chunks, asks the LLM with IndustryPrompt, and returns
    the parsed JSON dict ("industry", "niche", "reasoning").
    """
    question = (
        "What is the name and its specific niche business this document pertains to."
    )
    docs = await get_docs(question, pdf_content, file_name)
    context = "\n\n".join(docs)
    message = f"CONTEXT\n\n{context}\n\n"
    raw = response(
        message=message,
        model="meta-llama/llama-3-70b-instruct:nitro",
        SysPrompt=IndustryPrompt,
        temperature=0,
    )
    parsed = json.loads(raw)
    print(parsed)
    return parsed
573
-
574
-
575
def split_into_chunks(input_string, token_limit=4500):
    """
    Split *input_string* into chunks of at most *token_limit* tokens,
    preferring to break at a whitespace token so words are not cut mid-way.
    Returns the list of decoded chunk strings.
    """
    # Initialize the tokenizer for the model
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # Encode the input string to get the tokens
    tokens = encoding.encode(input_string)

    # List to store chunks
    chunks = []
    start = 0

    # Iterate over the tokens and split into chunks
    while start < len(tokens):
        end = start + token_limit
        if end >= len(tokens):
            # Remaining tokens fit in one final chunk.
            chunk_tokens = tokens[start:]
        else:
            # Walk backwards from the limit looking for a whitespace token
            # to break on.
            break_point = end
            while break_point > start and tokens[break_point] not in encoding.encode(
                " "
            ):
                break_point -= 1

            if break_point == start:
                # No whitespace found: hard-split at the token limit.
                chunk_tokens = tokens[start:end]
            else:
                chunk_tokens = tokens[start:break_point]
                end = break_point

        chunk = encoding.decode(chunk_tokens)
        chunks.append(chunk)
        start = end

    return chunks
609
-
610
-
611
def further_split_chunk(chunk, token_limit):
    """
    Split *chunk* into sub-chunks of at most *token_limit* tokens.

    CONSISTENCY: the original body duplicated split_into_chunks() verbatim
    (same tokenizer, same whitespace break-point search); delegate to it so
    the splitting logic lives in exactly one place.
    """
    return split_into_chunks(chunk, token_limit=token_limit)
639
-
640
-
641
# Define the investment function
def investment(queries, query_results, other_info_results, Funding):
    """
    Grade the startup's investment potential with an LLM.

    Builds a CONTEXT from the last four query answers plus the web-research
    results, sends it (chunked to fit the model window) with the Investment
    rubric prompt, and aggregates the per-chunk JSON analyses into a single
    JSON string: {"sectors": [...], "final_score": <mean overall_score>}.

    :param queries: list of due-diligence questions (last 4 are graded keys).
    :param query_results: answers aligned with *queries*.
    :param other_info_results: {section: markdown/text} web-research results.
    :param Funding: funding amount/classification passed to the prompt.
    """
    # Combine queries and query_results into a dictionary
    combined_results = {q: r for q, r in zip(queries[-4:], query_results[-4:])}

    # Merge in the web-research sections, stripping the collapsible
    # reference blocks appended by markdown().
    for key, value in other_info_results.items():
        if isinstance(value, str):
            combined_results[key] = value.split("<details><summary>")[0].strip()
        else:
            combined_results[key] = value
    print(combined_results)
    message = f"CONTEXT:\n\n{json.dumps(combined_results, indent=4)}\n\nFUNDING:\n\n{Funding}\n\n"

    sys_prompt = Investment
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    sys_prompt_token_size = len(encoding.encode(sys_prompt))

    max_model_tokens = 7000
    max_chunk_size = 7000  # Adjust to leave more buffer space

    chunks = split_into_chunks(message, token_limit=max_chunk_size)

    model = "anthropic/claude-3.5-sonnet"
    responses = []
    tokens_used = 0
    max_tokens_per_minute = 7000

    for chunk in chunks:
        chunk_token_size = len(encoding.encode(chunk))
        combined_message = f"{sys_prompt}\n{chunk}"
        combined_token_size = len(encoding.encode(combined_message))

        print(
            f"Token size of the combined message and SysPrompt for this chunk: {combined_token_size}"
        )
        print(f"Chunk token size: {chunk_token_size}")
        print(f"SysPrompt token size: {sys_prompt_token_size}")

        if combined_token_size > max_model_tokens:
            print(
                f"Warning: Combined token size ({combined_token_size}) exceeds the model's limit ({max_model_tokens}). Adjusting chunk size."
            )
            sub_chunks = further_split_chunk(
                chunk, max_model_tokens - sys_prompt_token_size
            )
            for sub_chunk in sub_chunks:
                sub_chunk_token_size = len(encoding.encode(sub_chunk))
                print(sub_chunk_token_size)
                # Tiny fragments carry no gradeable content; skip them.
                if sub_chunk_token_size > 500:
                    sub_combined_message = f"{sys_prompt}\n{sub_chunk}"
                    sub_combined_token_size = len(encoding.encode(sub_combined_message))
                    if sub_combined_token_size <= max_model_tokens:
                        response_str = response(
                            message=sub_chunk,
                            model=model,
                            SysPrompt=sys_prompt,
                            temperature=0,
                        )
                        print(response_str)
                        json_part = extract_json(response_str)
                        if json_part:
                            responses.append(json_part)
                        else:
                            print("Warning: No valid JSON part found in the response.")
                        # Crude rate limiting on estimated tokens per minute.
                        tokens_used += sub_combined_token_size
                        if tokens_used >= max_tokens_per_minute:
                            print("Waiting for 60 seconds to avoid rate limit.")
                            time.sleep(60)
                            tokens_used = 0
        else:
            if chunk_token_size >= 500:
                response_str = response(
                    message=chunk, model=model, SysPrompt=sys_prompt, temperature=0
                )
                print(response_str)
                json_part = extract_json(response_str)
                if json_part:
                    responses.append(json_part)
                else:
                    print("Warning: No valid JSON part found in the response.")
                tokens_used += combined_token_size
                if tokens_used >= max_tokens_per_minute:
                    print("Waiting for 60 seconds to avoid rate limit.")
                    time.sleep(60)
                    tokens_used = 0

    combined_json = {"sectors": [], "final_score": 0}
    total_score = 0
    count = 0

    for response_str in responses:
        response_json = json.loads(response_str)
        combined_json["sectors"].append(response_json)
        # BUGFIX: the rubric prompt's schema declares "overall_score" as a
        # string, so the model may return e.g. "7.5" — the original
        # `total_score += response_json["overall_score"]` then raised
        # TypeError.  Coerce to float and skip non-numeric/missing scores.
        try:
            total_score += float(response_json["overall_score"])
            count += 1
        except (KeyError, TypeError, ValueError):
            print("Warning: response missing a numeric 'overall_score'; skipped.")

    if count > 0:
        combined_json["final_score"] = total_score / count
    final_json = json.dumps(combined_json, indent=4)
    print(final_json)
    return final_json
744
-
745
-
746
def extract_json(response_str):
    """
    Pull the first '{' through last '}' span out of an LLM reply and return
    it only if it parses as JSON; otherwise return None.  The greedy DOTALL
    match deliberately spans newlines inside code fences.
    """
    match = re.search(r"\{.*}", response_str, re.DOTALL)
    if not match:
        return None
    candidate = match.group()
    try:
        json.loads(candidate)  # validate before handing it back
    except json.JSONDecodeError:
        print("Invalid JSON detected.")
        return None
    return candidate
757
-
758
-
759
async def answer(client, question, pdf_content, file_name):
    """
    Route *question* and generate an answer over the PDF's retrieved context.

    Step 1: ask the router (QuestionRouter) whether the retrieved CONTEXT can
    answer the question.  Step 2: generate either a grounded answer
    (GenerationPrompt) or a missing-information list (MissingInformation).
    Returns the raw chat-completion object from the second call.

    :raises ValueError: if the router returns an unrecognised datasource.
    """
    docs = await get_docs(question, pdf_content, file_name)
    context = "\n\n".join(docs)
    message = f"CONTEXT:\n\n{context}\n\nQUESTION :\n\n{question}\n\n"
    model = "meta-llama/llama-3-70b-instruct:nitro"
    messages = [
        {"role": "system", "content": QuestionRouter},
        {"role": "user", "content": message},
    ]
    response_str = await client.chat.completions.create(
        messages=messages, model=model, temperature=0
    )
    print(response_str)
    source = json.loads(response_str.choices[0].message.content)
    print(source)

    datasource = source["datasource"].lower()
    if datasource == "vectorstore":
        print("---ROUTE QUESTION TO RAG---")
        system_prompt = GenerationPrompt
    elif datasource == "missing_information":
        print("---NO SUFFICIENT INFORMATION---")
        system_prompt = MissingInformation
    else:
        # BUGFIX: an unexpected router output previously fell through both
        # branches and crashed with UnboundLocalError on `output`; fail with
        # a clear error instead.
        raise ValueError(f"Unexpected datasource from router: {source}")

    message = f"CONTEXT:\n\n{context}\n\nQUESTION:\n\n{question}\n\nANSWER:\n"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    output = await client.chat.completions.create(
        messages=messages, model=model, temperature=0
    )

    return output
803
-
804
-
805
async def process_queries(queries, pdf_content, file_name):
    """
    Answer every query concurrently against the same PDF via OpenRouter.

    Returns the answer texts in the same order as *queries*.
    """
    async_client = openai.AsyncClient(
        api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1"
    )
    async with async_client as aclient:
        # gather() preserves input order, so results line up with queries.
        completions = await asyncio.gather(
            *(answer(aclient, query, pdf_content, file_name) for query in queries)
        )

    return [completion.choices[0].message.content for completion in completions]
819
-
820
-
821
- async def web_search(session, question):
822
- data = {
823
- "topic": "",
824
- "description": question,
825
- "user_id": "",
826
- "user_name": "",
827
- "internet": True,
828
- "output_format": "report_table",
829
- "data_format": "No presets",
830
- }
831
- async with session.post(
832
- "https://pvanand-search-generate-staging.hf.space/generate_report",
833
- json=data,
834
- headers={"Content-Type": "application/json"},
835
- ) as response:
836
- print(f"Status: {response.status}")
837
- print(f"Headers: {response.headers}")
838
- content = await response.text()
839
- print(f"Content: {content[:200]}...") # Print first 200 chars of content
840
- if response.headers.get("Content-Type", "").startswith("application/json"):
841
- return await response.json()
842
- else:
843
- raise ValueError(
844
- f"Unexpected content type: {response.headers.get('Content-Type')}"
845
- )
846
-
847
-
848
- async def other_info(pdf_content, file_name):
849
- data = await industry(pdf_content, file_name)
850
- industry_company = data.get("industry")
851
- niche = data.get("niche")
852
-
853
- # Define the questions for each category
854
- questions = {
855
- "Risk Involved": f"What are risk involved in the starting a {niche} business in {industry_company}?",
856
- "Barrier To Entry": f"What are barrier to entry for a {niche} business in {industry_company}?",
857
- "Competitors": f"Who are the main competitors in the market for {niche} business in {industry_company}?",
858
- "Challenges": f"What are in the challenges in the {niche} business for {industry_company}?",
859
- }
860
-
861
- # Fetch the results for each category
862
- results = {}
863
- async with aiohttp.ClientSession() as session:
864
- tasks = [web_search(session, question) for question in questions.values()]
865
- responses = await asyncio.gather(*tasks, return_exceptions=True)
866
-
867
- for type_, response in zip(questions, responses):
868
- if isinstance(response, Exception):
869
- results[type_] = {"error": str(response)}
870
- else:
871
- results[type_] = markdown(response)
872
-
873
- return results
874
-
875
-
876
- @cache(expire=604800)
877
- async def upload_pitchdeck(pdf_content: bytes, file_name: str, Funding: float):
878
-
879
- # Assuming process_queries and other_info are your own defined async functions
880
- query_results = await process_queries(queries, pdf_content, file_name)
881
- other_info_results = await other_info(pdf_content, file_name)
882
- grading_results = json.loads(
883
- investment(queries, query_results, other_info_results, Funding)
884
- )
885
-
886
- return {
887
- "queries": queries,
888
- "query_results": query_results,
889
- "other_info_results": other_info_results,
890
- "grading_results": grading_results,
891
- }
892
-
893
-
894
- @app.post("/investor")
895
- async def process_pitchdeck(file: UploadFile = File(...), Funding: float = Form(...)):
896
- if not file:
897
- raise HTTPException(status_code=400, detail="PDF file not provided")
898
- pdf_content = await file.read()
899
- file_name = file.filename
900
-
901
- return await upload_pitchdeck(pdf_content, file_name, Funding)
902
-
903
-
904
- app.add_middleware(
905
- CORSMiddleware,
906
- allow_origins=["*"],
907
- allow_credentials=True,
908
- allow_methods=["*"],
909
- allow_headers=["*"],
910
- )
 
1
+ from fastapi import FastAPI, UploadFile, HTTPException
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import asyncio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ from utils.PdfUtils import ProcessPdf
6
+ from utils.HelperFunctions import (
7
+ generate_file_id,
8
+ save_to_database,
9
+ retrieve_from_database,
10
+ )
11
+ from utils.VectorDatabase import AdvancedClient
12
+ from utils.ModelCallingFunctions import (
13
+ industry_finder,
14
+ other_info,
15
+ business_information,
16
+ )
17
 
18
# Application singleton and the shared, persistent vector-store client.
app = FastAPI()

# One ChromaDB-backed client reused across all requests.
client = AdvancedClient("VectorDB")
21
+
22
+
23
@app.post(
    "/get_analysis",
    responses={
        200: {
            "description": "Successful Response",
            "content": {
                "application/json": {
                    "example": {
                        "industry": {
                            "pitch-deck": "File Name",
                            "industry": "XYZ",
                            "niche": "ABC",
                        },
                        "other_info": {
                            "Risk Involved": "Markdown",
                            "Barrier To Entry": "Markdown",
                            "Competitors": "Markdown",
                            "Challenges": "Markdown",
                        },
                        "business_info": {
                            "product-and-market": "{...}",
                            "team-and-strategy": "{...}",
                            "financials": "{...}",
                        },
                    }
                }
            },
        }
    },
)
async def get_analysis(pdf_file: UploadFile):
    """Analyse an uploaded pitch-deck PDF.

    The deck is content-addressed (hash of its first 4 KiB): the first upload
    is chunked, embedded, analysed and cached; repeat uploads are served from
    the SQLite cache without re-running the models.
    """
    if not pdf_file:
        raise HTTPException(status_code=400, detail="Pitch PDF file not provided")
    pdf_content = await pdf_file.read()
    pdf_id = generate_file_id(pdf_content)
    file_name = pdf_file.filename

    existing_ids = {c.name for c in client.client.list_collections()}
    if pdf_id not in existing_ids:
        pdf_chunks = ProcessPdf(pdf_content=pdf_content)
        client.create_collection(collection_id=pdf_id, file_datas=pdf_chunks)

        # Pitch-deck information extraction and structuring.
        industry_info = industry_finder(collection_id=pdf_id)
        industry_info["pitch-deck"] = file_name

        # BUG FIX: asyncio.run() raises RuntimeError inside a running event
        # loop (which is exactly where an async FastAPI handler executes);
        # these coroutines must be awaited directly.
        other_info_results = await other_info(company_data=industry_info)
        business_info = await business_information(collection_id=pdf_id)

        # Renamed from `json` to avoid shadowing the json module name.
        result = {
            "industry": industry_info,
            "other_info": other_info_results,
            "business_info": business_info,
        }
        save_to_database(_id=pdf_id, data=result)
        return result
    else:
        # Deck already processed: serve the cached analysis.
        return retrieve_from_database(_id=pdf_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,64 +1,142 @@
 
 
 
 
1
  annotated-types==0.7.0
2
  anyio==4.4.0
3
- cachetools==5.3.3
4
- certifi==2024.2.2
 
 
 
 
 
 
 
5
  charset-normalizer==3.3.2
 
 
 
 
 
6
  distro==1.9.0
7
- exceptiongroup==1.2.1
8
- google-ai-generativelanguage==0.6.4
9
- google-api-core==2.19.0
10
- google-api-python-client==2.131.0
11
- google-auth==2.29.0
 
 
 
 
 
 
 
 
 
 
12
  google-auth-httplib2==0.2.0
13
- google-generativeai==0.5.4
14
- googleapis-common-protos==1.63.0
15
- grpcio==1.64.0
16
- grpcio-status==1.62.2
17
  h11==0.14.0
 
18
  httpcore==1.0.5
19
  httplib2==0.22.0
20
- httpx==0.27.0
21
- idna==3.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  numpy==1.26.4
23
- openai==1.30.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  pandas==2.2.2
25
- Pillow==9.5.0
26
- proto-plus==1.23.0
27
- protobuf==4.25.3
 
 
28
  pyasn1==0.6.0
29
  pyasn1_modules==0.4.0
30
- pydantic==2.7.2
31
- pydantic_core==2.18.3
32
- PyMuPDF==1.24.5
33
- PyMuPDFb==1.24.3
34
- pyparsing==3.1.2
 
 
 
 
 
35
  python-dateutil==2.9.0.post0
36
  python-dotenv==1.0.1
 
37
  pytz==2024.1
 
 
 
38
  requests==2.32.3
 
 
 
39
  rsa==4.9
 
40
  six==1.16.0
 
41
  sniffio==1.3.1
42
- tqdm==4.66.4
43
- typing_extensions==4.12.0
 
 
 
 
 
 
 
 
 
 
44
  tzdata==2024.1
45
  uritemplate==4.1.1
46
- urllib3==2.2.1
47
- psycopg2-binary==2.9.9
48
- pinecone-client==4.1.0
49
- cohere==5.5.4
50
- tiktoken==0.7.0
51
- html2text == 2024.2.26
52
- mistune==3.0.2
53
- tenacity==8.3.0
54
- streamlit==1.35.0
55
- pdfkit==1.0.0
56
- Markdown==3.6
57
- xhtml2pdf== 0.2.16
58
- reportlab==4.2.0
59
- beautifulsoup4==4.12.3
60
- fastapi==0.111.0
61
- uvicorn==0.29.0
62
- onnxruntime==1.18.0
63
- aiohttp==3.9.5
64
- fastapi-cache2
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.5
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
  annotated-types==0.7.0
6
  anyio==4.4.0
7
+ asgiref==3.8.1
8
+ async-timeout==4.0.3
9
+ attrs==24.2.0
10
+ backoff==2.2.1
11
+ bcrypt==4.2.0
12
+ blinker==1.8.2
13
+ build==1.2.2
14
+ cachetools==5.5.0
15
+ certifi==2024.8.30
16
  charset-normalizer==3.3.2
17
+ chroma-hnswlib==0.7.6
18
+ chromadb==0.5.5
19
+ click==8.1.7
20
+ coloredlogs==15.0.1
21
+ Deprecated==1.2.14
22
  distro==1.9.0
23
+ dnspython==2.6.1
24
+ email_validator==2.2.0
25
+ exceptiongroup==1.2.2
26
+ fastapi==0.114.1
27
+ fastapi-cli==0.0.5
28
+ filelock==3.16.0
29
+ flatbuffers==24.3.25
30
+ frozenlist==1.4.1
31
+ fsspec==2024.9.0
32
+ gitdb==4.0.11
33
+ GitPython==3.1.43
34
+ google-ai-generativelanguage==0.6.6
35
+ google-api-core==2.19.2
36
+ google-api-python-client==2.144.0
37
+ google-auth==2.34.0
38
  google-auth-httplib2==0.2.0
39
+ google-generativeai==0.7.2
40
+ googleapis-common-protos==1.65.0
41
+ grpcio==1.66.1
42
+ grpcio-status==1.62.3
43
  h11==0.14.0
44
+ html2text==2024.2.26
45
  httpcore==1.0.5
46
  httplib2==0.22.0
47
+ httptools==0.6.1
48
+ httpx==0.27.2
49
+ huggingface-hub==0.24.6
50
+ humanfriendly==10.0
51
+ idna==3.8
52
+ importlib_metadata==8.4.0
53
+ importlib_resources==6.4.5
54
+ Jinja2==3.1.4
55
+ jiter==0.5.0
56
+ jsonschema==4.23.0
57
+ jsonschema-specifications==2023.12.1
58
+ kubernetes==30.1.0
59
+ Markdown==3.7
60
+ markdown-it-py==3.0.0
61
+ MarkupSafe==2.1.5
62
+ mdurl==0.1.2
63
+ mmh3==4.1.0
64
+ monotonic==1.6
65
+ mpmath==1.3.0
66
+ multidict==6.1.0
67
+ narwhals==1.6.4
68
  numpy==1.26.4
69
+ oauthlib==3.2.2
70
+ onnxruntime==1.19.2
71
+ openai==1.44.1
72
+ opentelemetry-api==1.27.0
73
+ opentelemetry-exporter-otlp-proto-common==1.27.0
74
+ opentelemetry-exporter-otlp-proto-grpc==1.27.0
75
+ opentelemetry-instrumentation==0.48b0
76
+ opentelemetry-instrumentation-asgi==0.48b0
77
+ opentelemetry-instrumentation-fastapi==0.48b0
78
+ opentelemetry-proto==1.27.0
79
+ opentelemetry-sdk==1.27.0
80
+ opentelemetry-semantic-conventions==0.48b0
81
+ opentelemetry-util-http==0.48b0
82
+ orjson==3.10.7
83
+ overrides==7.7.0
84
+ packaging==24.1
85
  pandas==2.2.2
86
+ pillow==10.4.0
87
+ posthog==3.6.5
88
+ proto-plus==1.24.0
89
+ protobuf==4.25.4
90
+ pyarrow==17.0.0
91
  pyasn1==0.6.0
92
  pyasn1_modules==0.4.0
93
+ pydantic==2.9.1
94
+ pydantic_core==2.23.3
95
+ pydeck==0.9.1
96
+ Pygments==2.18.0
97
+ PyMuPDF==1.24.10
98
+ PyMuPDFb==1.24.10
99
+ pyparsing==3.1.4
100
+ PyPika==0.48.9
101
+ pyproject_hooks==1.1.0
102
+ pysqlite3-binary==0.5.3.post1
103
  python-dateutil==2.9.0.post0
104
  python-dotenv==1.0.1
105
+ python-multipart==0.0.9
106
  pytz==2024.1
107
+ PyYAML==6.0.2
108
+ referencing==0.35.1
109
+ regex==2024.7.24
110
  requests==2.32.3
111
+ requests-oauthlib==2.0.0
112
+ rich==13.8.1
113
+ rpds-py==0.20.0
114
  rsa==4.9
115
+ shellingham==1.5.4
116
  six==1.16.0
117
+ smmap==5.0.1
118
  sniffio==1.3.1
119
+ starlette==0.38.5
120
+ streamlit==1.38.0
121
+ sympy==1.13.2
122
+ tenacity==8.5.0
123
+ tiktoken==0.7.0
124
+ tokenizers==0.20.0
125
+ toml==0.10.2
126
+ tomli==2.0.1
127
+ tornado==6.4.1
128
+ tqdm==4.66.5
129
+ typer==0.12.5
130
+ typing_extensions==4.12.2
131
  tzdata==2024.1
132
  uritemplate==4.1.1
133
+ urllib3==2.2.2
134
+ uvicorn==0.30.6
135
+ uvloop==0.20.0
136
+ watchdog==4.0.2
137
+ watchfiles==0.24.0
138
+ websocket-client==1.8.0
139
+ websockets==13.0.1
140
+ wrapt==1.16.0
141
+ yarl==1.11.1
142
+ zipp==3.20.1
 
 
 
 
 
 
 
 
 
utils/.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ TOGETHER_API="<REDACTED-ROTATE-THIS-KEY>"
2
+ GEMINI_API="<REDACTED-ROTATE-THIS-KEY>"
3
+ OPENROUTER_API_KEY="<REDACTED-ROTATE-THIS-KEY>"
4
+ X_API_KEY="<REDACTED-ROTATE-THIS-KEY>"
classifier.py → utils/ChartClassifier.py RENAMED
@@ -13,7 +13,7 @@ class Classifier:
13
  """
14
  img : PIL Image object of shape (B,HxW,C)
15
  """
16
- img = img.resize((192,192))
17
  np_image = np.asarray(img) / 255
18
  return np_image.astype(np.float32)
19
 
 
13
  """
14
  img : PIL Image object of shape (B,HxW,C)
15
  """
16
+ img = img.resize((192, 192))
17
  np_image = np.asarray(img) / 255
18
  return np_image.astype(np.float32)
19
 
utils/HelperFunctions.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from typing import List
3
+ import hashlib
4
+ import re
5
+ import sqlite3
6
+
7
+ import sqlite3
8
+ import json
9
+
10
+
11
def save_to_database(_id, data, db_path="utils/information.db"):
    """Persist a JSON-serialisable *data* payload under key *_id*.

    Rows with the same id are overwritten (INSERT OR REPLACE). *db_path*
    defaults to the project store but may be overridden, e.g. in tests.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on error; the
        # connection itself still has to be closed explicitly.
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS json_data (
                    id TEXT PRIMARY KEY,
                    data TEXT
                )
                """
            )
            conn.execute(
                "INSERT OR REPLACE INTO json_data (id, data) VALUES (?, ?)",
                (_id, json.dumps(data)),
            )
    finally:
        conn.close()
38
+
39
+
40
def retrieve_from_database(_id, db_path="utils/information.db"):
    """Return the JSON payload stored under *_id*, or None if not found.

    Also returns None when the table has never been created (a fresh
    database), instead of raising sqlite3.OperationalError.
    """
    conn = sqlite3.connect(db_path)
    try:
        try:
            row = conn.execute(
                "SELECT data FROM json_data WHERE id = ?", (_id,)
            ).fetchone()
        except sqlite3.OperationalError:
            # Table does not exist yet — nothing has been cached.
            return None
    finally:
        conn.close()
    return json.loads(row[0]) if row else None
53
+
54
+
55
def generate_file_id(file_bytes: bytes) -> str:
    """Derive a stable identifier from the leading 4 KiB of a file."""
    # Hashing only the first 4096 bytes keeps this O(1) for large uploads.
    digest = hashlib.sha256(file_bytes[:4096]).hexdigest()
    # Truncated to 63 characters (collection-name length limit downstream).
    return str(digest[:63])
62
+
63
+
64
def extract_content(text):
    """Return the first <report-chart>...</report-chart> payload in *text*.

    Returns an empty string when no tag is present (the original code
    raised IndexError on ``matches[0]`` for tag-less input).
    """
    matches = re.findall(r"<report-chart>(.*?)</report-chart>", text, re.DOTALL)
    return matches[0] if matches else ""
68
+
69
+
70
def CountTokens(texts: List[str]) -> List[int]:
    """Return the token count of each string in *texts*.

    Uses the gpt-3.5-turbo BPE encoding as a proxy for chunk sizing.
    """
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return [len(tokens) for tokens in encoder.encode_batch(texts)]
78
+
79
+
80
def web_search_result_processor(output):
    """Extract the report payload from a raw web-search response string.

    The search endpoint wraps the useful content in <report-chart> tags;
    everything else is discarded. (An earlier HTML-to-Markdown pipeline
    that lived here as commented-out code has been removed.)
    """
    return extract_content(output)
utils/HyDE.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53acfcd33e1526015a426b059d1636d887bc3ac4e0c7fde62f0e32e456651aa8
3
+ size 111026
utils/ModelCallingFunctions.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ from PIL import Image
4
+ from dotenv import load_dotenv
5
+ import json
6
+ import pickle
7
+
8
+ import asyncio
9
+ import aiohttp
10
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
11
+
12
+ from openai import OpenAI, AsyncClient
13
+ import google.generativeai as gemini
14
+
15
+ from .VectorDatabase import AdvancedClient
16
+ from .HelperFunctions import web_search_result_processor
17
+
18
+ from .prompts import PROMPTS
19
+
20
# Load API credentials from the package-local .env file.
load_dotenv("utils/.env")

TOGETHER_API = os.getenv("TOGETHER_API")
GEMINI_API = os.getenv("GEMINI_API")
X_API_KEY = os.getenv("X_API_KEY")

# Shared vector-store client used by every retrieval in this module.
client = AdvancedClient(vector_database_path="VectorDB")

# Pre-computed HyDE (hypothetical document) query embeddings, keyed as
# section -> sub-question; generated offline and shipped as a pickle.
with open("utils/HyDE.bin", "rb") as hyde_file:
    HyDE = pickle.load(hyde_file)
30
+
31
+
32
def image_data_extractor(img: Image.Image, text: str) -> str:
    """Describe the chart/table content of *img* via Gemini.

    *text* is the page text surrounding the image, passed as context so
    the model can anchor numeric figures it reads off the chart.
    """
    gemini.configure(api_key=GEMINI_API)
    prompt = PROMPTS["gemini-image"].format(text=text)
    model = gemini.GenerativeModel("gemini-1.5-flash")
    result = model.generate_content([prompt, img], stream=False)
    return result.text
38
+
39
+
40
def generate_embedding(
    texts: List[str], embedding_model: str = "BAAI/bge-large-en-v1.5"
) -> List[List[float]]:
    """Embed each string in *texts* using a Together-hosted embedding model."""
    api = OpenAI(api_key=TOGETHER_API, base_url="https://api.together.xyz/v1")
    items = api.embeddings.create(input=texts, model=embedding_model).data
    return [item.embedding for item in items]
51
+
52
+
53
def industry_finder(collection_id):
    """Infer the company's industry and niche from the stored pitch deck.

    Retrieves the five most relevant chunks for a fixed probing question
    and asks the model (JSON-mode system prompt) to classify the business.
    Returns the parsed dict, e.g. {"industry": ..., "niche": ...}.
    """
    question = (
        "What is the name and its specific niche business this document pertains to."
    )
    docs = client.retrieve_chunks(
        collection_id=collection_id, query=question, number_of_chunks=5
    )
    context = "\n\n".join(docs)
    raw = response(
        message=f"CONTEXT\n\n{context}\n\n",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        SysPrompt=PROMPTS["industry-finder"],
        temperature=0,
    )
    return json.loads(raw)
72
+
73
+
74
async def web_search(session, question):
    """POST *question* to the search-assistant service; return the raw body text."""
    payload = {"query": question, "model_id": "openai/gpt-4o-mini"}
    headers = {"X-API-KEY": X_API_KEY, "Content-Type": "application/json"}
    async with session.post(
        "https://general-chat.elevatics.cloud/search-assistant",
        json=payload,
        headers=headers,
    ) as response:
        print(f"Status: {response.status}")
        print(f"Content: {response.content}")
        content = await response.text()
        return content
85
+
86
+
87
async def other_info(company_data):
    """Run four market-research web searches for the company's industry/niche.

    Returns a dict keyed by research category; a failed search is reported
    as {"error": <message>} instead of failing the whole batch.
    """
    industry_company = company_data.get("industry")
    niche = company_data.get("niche")

    questions = {
        "Risk Involved": f"What are risk involved in the starting a {niche} business in {industry_company}?, please be concise.",
        "Barrier To Entry": f"What are barrier to entry for a {niche} business in {industry_company}?, please be concise.",
        "Competitors": f"Who are the main competitors in the market for {niche} business in {industry_company}?, please be concise.",
        "Challenges": f"What are in the challenges in the {niche} business for {industry_company}?, please be concise.",
    }

    results = {}
    async with aiohttp.ClientSession() as session:
        replies = await asyncio.gather(
            *(web_search(session, q) for q in questions.values()),
            return_exceptions=True,
        )
        for label, reply in zip(questions, replies):
            if isinstance(reply, Exception):
                results[label] = {"error": str(reply)}
            else:
                results[label] = reply

    return results
112
+
113
+
114
async def answer(client, context: str, SysPrompt: str):
    """Ask the 70B chat model to answer from *context* under *SysPrompt*.

    *client* is an async OpenAI-compatible client. Returns the completion
    text. (Removed leftover debug prints "herere"/"nononon".)
    """
    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": f"CONTEXT:\n\n{context}"},
    ]
    completion = await client.chat.completions.create(
        messages=messages,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        temperature=0,
    )
    return completion.choices[0].message.content
129
+
130
+
131
async def business_information(collection_id):
    """Answer every structured due-diligence prompt against the stored deck.

    For each section ("product-and-market", "team-and-strategy",
    "financials") and each sub-question prompt, retrieves the nearest deck
    chunks via a precomputed HyDE embedding and asks the model concurrently.
    Returns {section: {sub_question: answer_text}}.
    """
    async_client = AsyncClient(
        api_key=TOGETHER_API, base_url="https://api.together.xyz/v1"
    )
    keys = ["product-and-market", "team-and-strategy", "financials"]

    # Record (section, sub-question) in task submission order so responses
    # can be mapped back reliably.
    task_index = []
    async with async_client as aclient:
        tasks = []
        for i_key in keys:
            for j_key in PROMPTS[i_key]:
                embedding = HyDE[i_key][j_key]
                sys_prompt = PROMPTS[i_key][j_key]
                chunks = client.retrieve_chunks(
                    collection_id=collection_id, query_embedding=embedding
                )
                context = "\n\n".join(chunks)
                tasks.append(
                    asyncio.create_task(
                        answer(client=aclient, context=context, SysPrompt=sys_prompt)
                    )
                )
                task_index.append((i_key, j_key))
                # Light pacing between submissions (rate-limit friendliness).
                await asyncio.sleep(1)

        responses = await asyncio.gather(*tasks)

    # BUG FIX: the old `i_count * 4 + j_count` indexing silently assumed
    # exactly four prompts per section; zip against the recorded order.
    response_dict = {key: {} for key in keys}
    for (i_key, j_key), reply in zip(task_index, responses):
        response_dict[i_key][j_key] = reply
    return response_dict
162
+
163
+
164
def response(
    message: str,
    model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    SysPrompt: str = PROMPTS["default"],
    temperature: float = 0.2,
) -> str:
    """Single-turn chat completion with exponential-backoff retries.

    Fixed the annotations (everything was typed ``object``) and removed the
    per-attempt debug print. Returns the model's text, or the sentinel
    string "NONE" once the six retries are exhausted — callers check for
    that value.
    """
    api = OpenAI(api_key=TOGETHER_API, base_url="https://api.together.xyz/v1")

    messages = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]

    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
    def completion_with_backoff(**kwargs):
        return api.chat.completions.create(**kwargs)

    try:
        completion = completion_with_backoff(
            model=model,
            messages=messages,
            temperature=temperature,
            frequency_penalty=0.2,
        )
        return str(completion.choices[0].message.content)
    except Exception as e:
        print(f"An error occurred: {e}")
        return "NONE"
utils/PdfUtils.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy
2
+ from PIL import Image
3
+ from typing import List, Tuple
4
+
5
+ import pymupdf
6
+
7
+ from .ChartClassifier import Classifier
8
+ from .HelperFunctions import CountTokens
9
+ from .ModelCallingFunctions import image_data_extractor
10
+
11
+
12
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Describe every chart-like image in *pixmap_list* with the vision model.

    Images are first screened by the ONNX chart classifier; only detected
    charts are sent to Gemini. *text* is the surrounding page text used as
    context. Returns one description string per detected chart.
    (Removed a leftover debug print of the classifier output.)
    """
    # Fresh inference session per call; model file ships with the package.
    classifier = Classifier("utils/graph_classifierV2_B.onnx")

    img_list = []
    for pixmap in pixmap_list:
        try:
            img_list.append(
                Image.frombytes(
                    mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
                )
            )
        except Exception as e:
            # Skip pixmaps that cannot be decoded (e.g. unexpected colourspace).
            print(e)

    graph_flags = classifier.classify(img_list)

    return [
        str(image_data_extractor(img=img_list[idx], text=text))
        for idx, is_graph in enumerate(graph_flags)
        if is_graph
    ]
41
+
42
+
43
def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, int]]:
    """
    Takes PDF(bytes) and returns a list of (text, page_number) tuples, where
    the text combines each page's textual content with descriptions of any
    chart images found on it. Pages longer than ~500 tokens are split into
    roughly equal character slices, each tagged with the same page number.
    """
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")

    pages_content = []
    refered_xref = []  # image xrefs already processed (images reused across pages)
    for page_number in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_number)

        # Textual content, with newlines flattened to tabs.
        text_content = str(page.get_text()).replace("\n", "\t")
        page_content = text_content

        # Image content: collect pixmaps not seen on earlier pages.
        pixmap_list = []
        for img_info in page.get_image_info(xrefs=True):
            xref = img_info["xref"]
            if xref not in refered_xref:
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    refered_xref.append(xref)
                except ValueError as e:
                    print(f"Skipping image with due to error: {e}")
        if pixmap_list:
            img_content = extract_image_content(
                pixmap_list=pixmap_list, text=text_content
            )
            page_content = page_content + "\n\n" + "\n\n".join(img_content)

        pages_content.append(page_content)

    num_tokens = CountTokens(pages_content)

    final_data = []

    # Split any page whose content exceeds ~500 tokens into n roughly equal
    # character slices (token counts approximated proportionally by length).
    for e, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = int(numpy.ceil(n_token / 500))
            len_content = len(pages_content[e])
            part_size = len_content // n_parts
            start = 0
            for part in range(n_parts):
                # BUG FIX: the old loop advanced by part_size for the last
                # slice too, dropping up to n_parts-1 trailing characters;
                # the final slice now always runs to the end of the page.
                end = len_content if part == n_parts - 1 else start + part_size
                final_data.append((pages_content[e][start:end], e + 1))
                start = end
        else:
            final_data.append((pages_content[e], e + 1))

    pdf_doc.close()
    return final_data
utils/VectorDatabase.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contain Wrapper Class for ChormaDB client, that can process and store documents and retrive document chunks.
3
+ """
4
+
5
+ # for chromaDB
6
+ __import__("pysqlite3")
7
+ import sys
8
+
9
+ sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
10
+
11
+ from typing import List, Optional, Tuple
12
+ import chromadb
13
+
14
+
15
class AdvancedClient:
    """Thin wrapper around a persistent ChromaDB client.

    Stores pre-chunked documents with externally generated embeddings and
    retrieves the nearest chunks for a query string or a raw embedding.
    """

    def __init__(self, vector_database_path: str = "vectorDB") -> None:
        self.client = chromadb.PersistentClient(path=vector_database_path)

    def create_collection(
        self,
        collection_id: str,
        file_datas: List[Tuple[str, int]],
    ):
        """Create collection *collection_id* from (chunk_text, page_number) pairs."""
        chunks = [chunk for chunk, _ in file_datas]
        # BUG FIX: page numbers repeat when ProcessPdf splits a long page
        # into several chunks, and ChromaDB rejects duplicate ids; suffix a
        # running index to keep ids unique while preserving the page number.
        ids = [f"{page}-{idx}" for idx, (_, page) in enumerate(file_datas)]

        # Imported lazily to avoid a circular import with ModelCallingFunctions.
        from .ModelCallingFunctions import generate_embedding

        embeddings = generate_embedding(texts=chunks)

        collection = self.client.create_collection(collection_id)
        collection.add(
            ids=ids,
            embeddings=embeddings,  # type: ignore
            documents=chunks,
        )

    def retrieve_chunks(
        self,
        collection_id: str,
        query: str = "NONE",
        query_embedding: Optional[List[float]] = None,
        number_of_chunks: int = 3,
    ):
        """Return the *number_of_chunks* documents nearest to the query.

        If *query_embedding* is given it is used directly; otherwise *query*
        is embedded on the fly.
        """
        collection = self.client.get_collection(name=collection_id)

        if query_embedding is None:  # idiom fix: was `== None`
            from .ModelCallingFunctions import generate_embedding

            query_emb = generate_embedding([query])[0]
        else:
            query_emb = query_embedding

        results = collection.query(
            query_embeddings=query_emb,
            n_results=number_of_chunks,
        )

        return results["documents"][0]  # pyright: ignore
graph_classifierV2_B.onnx → utils/graph_classifierV2_B.onnx RENAMED
File without changes
utils/prompts.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System prompts for the investor-agent report sections, keyed first by
# report section, then by subsection. NOTE: the "financial-peformance" key
# is misspelled but kept as-is — presumably callers look it up by this
# exact string; do not rename without updating them.
PROMPTS = {
    "gemini-image": "You are provided with the images extracted from a pitch-deck and some text surrounding the image from the same pitch deck. Extract all the factual information that the image is trying to communicate through line charts, area line charts, bar charts, pie charts, tables, et cetera. Use OCR to extract numerical figures and include them in the information. If the image does not have any information, like it's a blank image or an image of a person, then the response should be NOTHING. Do not add any additional comments or markdown, just give information. \n\n SURROUNDING TEXT \n\n{text}",
    "industry-finder": """You are a business strategy consultant. You have been identifying niche markets and industries for companies across various sectors for over 20 years. Your expertise lies in analyzing detailed CONTEXT to accurately pinpoint the niche and industry of a business.
Objective: Identify the niche and industry of a business by analyzing the provided CONTEXT.
Steps to follow:
Read the context: Carefully read the provided information to understand the business's products, services, target audience, and unique value propositions.
Determine the industry: Based on the provided CONTEXT, identify the primary industry to which the business belongs. Consider factors such as the type of products/services offered, the market served, and industry-specific terminology.
Identify the niche: Analyze the details to pinpoint the specific niche within the industry. Look for unique aspects of the business, specialized market segments, or specific customer needs that the business addresses.
Provide output in JSON format: Clearly state the identified industry and niche in a JSON format. Ensure your reasoning supports the identified industry and niche. The output should be JSON. Do not add any additional formatting.
Output format:
{
"industry": "Identified industry here",
"niche": "Identified niche here"
}
Take a deep breath and work on this problem step-by-step.""",
    "default": "You are now in the role of an expert AI.",
    "product-and-market": {
        "product-service-overview": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the product or services offered by the business. Provide 5 to 10 factual bullet points, with a strong emphasis on the CONTEXT. If the information provided is insufficient or unclear, respond with 'Not enough information to provide a complete analysis.'""",
        "target-customer-problem-solved": """You are an experienced investor evaluating a pitch. Based on the provided CONTEXT, identify the target customers for the business and describe the problem the product or service is solving. Provide 5 to 10 factual bullet points focusing on customer pain points, market size, and the urgency of the problem. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "revenue-and-pricing": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the revenue streams and pricing strategy of the business. Provide 5 to 10 factual bullet points, focusing on how the business generates revenue, its pricing models, and any monetization strategies. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "market-growth-opportunity": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the market size, potential growth opportunities, and the business’s positioning within the industry. Provide 5 to 10 factual bullet points, focusing on market trends, scalability, and any competitive advantages. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
    "team-and-strategy": {
        "competitive-differentiation": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the business’s competitive differentiation. Provide 5 to 10 factual bullet points, focusing on how the company’s product, service, or strategy stands out from competitors, including unique features, intellectual property, or market positioning. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "partnerships-distribution": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the business’s partnerships and distribution strategy. Provide 5 to 10 factual bullet points, focusing on key partnerships, distribution channels, and how these contribute to the company’s growth and market reach. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "sales-marketing": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the business’s sales and marketing strategy. Provide 5 to 10 factual bullet points, focusing on the sales channels, marketing tactics, customer acquisition strategies, and how the business plans to scale its efforts. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "key-members": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the founder and key team members of the business. Provide 5 to 10 factual bullet points, focusing on their backgrounds, relevant experience, and roles in executing the business vision. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
    "financials": {
        # TODO(review): key misspelled ("peformance") — kept for lookup compatibility.
        "financial-peformance": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the company’s financial performance. Provide 5 to 10 factual bullet points, focusing on revenue, profitability, cash flow, and key financial metrics. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "key-metrics": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, analyze the key financial metrics of the company. Provide 5 to 10 factual bullet points, focusing on important indicators such as gross margin, EBITDA, net profit, customer acquisition cost (CAC), lifetime value (LTV), and other relevant metrics. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "cost-drivers": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, assess the company’s cost drivers. Provide 5 to 10 factual bullet points, focusing on the main expenses such as production costs, labor, marketing, technology, or other operational expenses. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
        "cash-projections": """You are an experienced investor reviewing a pitch. Based on the provided CONTEXT, evaluate the company’s cash flow projections. Provide 5 to 10 factual bullet points, focusing on projected cash inflows and outflows, sustainability of operations, and how the company plans to manage liquidity over time. If the information provided is insufficient, respond with 'Not enough information to provide a complete analysis.'""",
    },
}
36
# HyDE (Hypothetical Document Embeddings) seed passages: each entry is an
# imagined "ideal" document for its report subsection, embedded in place of
# the raw question to improve retrieval recall. Keys mirror the section /
# subsection structure of the prompt table used elsewhere in this module.
# NOTE(review): "financial-peformance" is misspelled — presumably callers
# look it up by this exact string, so do not rename without updating them.
HyDE_text = {
    "product-and-market": {
        "product-service-overview": """Imagine a company offering a unique product or service. The product is designed to solve a specific market problem and stands out due to its innovative features or superior quality. It addresses customer pain points and has the potential to scale in the industry. Now, based on the CONTEXT provided, retrieve documents that describe the products or services of the business at hand, highlighting their key attributes and competitive advantages.""",
        "target-customer-problem-solved": """Imagine a business that has identified a specific customer segment facing a significant problem. The company has designed a product or service that addresses this problem, providing a clear solution. The target customers experience pain points that affect their daily lives or business operations. Now, based on the CONTEXT provided, retrieve documents that detail the target customers and the problem the product or service is solving.""",
        "revenue-and-pricing": """Imagine a business with multiple revenue streams and a well-thought-out pricing strategy designed to maximize profitability. The company may offer subscription models, one-time purchases, or tiered pricing based on customer needs. The pricing reflects the value delivered to customers and the competitive landscape. Now, based on the CONTEXT provided, retrieve documents that explain the revenue streams and pricing strategy of the business.""",
        "market-growth-opportunity": """Imagine a company operating in a high-potential market with significant growth opportunities. The industry is expanding due to emerging trends, customer demand, or technological advancements. The company has identified key opportunities for growth, such as expanding to new markets, increasing customer segments, or leveraging competitive advantages. Now, based on the CONTEXT provided, retrieve documents that discuss the market size and growth opportunities for the business.""",
    },
    "team-and-strategy": {
        "competitive-differentiation": """Imagine a company that distinguishes itself from competitors through innovative products, unique services, or a strong market position. The company may have proprietary technology, a superior user experience, or a first-mover advantage. Now, based on the CONTEXT provided, retrieve documents that explain how the business differentiates itself from competitors and maintains a competitive edge""",
        "partnerships-distribution": """Imagine a company that has established strategic partnerships and a robust distribution network to enhance its market reach. The partnerships may involve suppliers, distributors, or technology partners, while the distribution strategy ensures the product or service is accessible to the target market through various channels. Now, based on the CONTEXT provided, retrieve documents that discuss the business’s key partnerships and distribution strategies.""",
        "sales-marketing": """Imagine a business with a well-defined sales and marketing strategy aimed at acquiring and retaining customers. The company uses various sales channels such as direct sales, online platforms, or partnerships, and employs marketing tactics like digital marketing, social media, or paid advertising to reach its audience. Now, based on the CONTEXT provided, retrieve documents that detail the business’s sales channels, marketing strategies, and customer acquisition plans.""",
        "key-members": """Imagine a business led by a visionary founder with a team of skilled professionals, each playing a critical role in driving the company forward. The founder brings relevant industry experience, and the key team members have complementary skills and leadership in areas like marketing, operations, or technology. Now, based on the CONTEXT provided, retrieve documents that describe the backgrounds, qualifications, and roles of the founder and key team members.""",
    },
    "financials": {
        "financial-peformance": """Imagine a company with a track record of financial performance, showing key metrics like revenue growth, profitability, and cash flow management. The company’s financial statements reflect its financial health, including historical performance and trends. Now, based on the CONTEXT provided, retrieve documents that detail the company’s financial performance, including revenue, profit margins, and other key financial indicators.""",
        "key-metrics": """Imagine a company presenting its key financial metrics to demonstrate its financial health and efficiency. Metrics like gross margin, EBITDA, net profit, and customer acquisition costs reflect the company’s profitability and operational efficiency. Now, based on the CONTEXT provided, retrieve documents that highlight the company's key financial metrics, including profitability, operational efficiency, and customer-related metrics.""",
        "cost-drivers": """Imagine a company identifying its main cost drivers that impact profitability. These might include production costs, labor expenses, marketing, technology investments, or operational overhead. Now, based on the CONTEXT provided, retrieve documents that explain the company’s major cost drivers and how these expenses affect its overall financial performance.""",
        "cash-projections": """Imagine a company presenting its cash flow projections, showing expected cash inflows and outflows over the coming quarters or years. The projections reflect the company’s ability to sustain operations, grow, and manage liquidity. Now, based on the CONTEXT provided, retrieve documents that explain the company’s cash flow projections, including key assumptions and expected financial outcomes.""",
    },
}