# startupCaptain / app.py
# Author (from page header): sainathBelagavi
# Last change: "Update app.py" (commit 896fecd, verified)
import streamlit as st
from huggingface_hub import InferenceClient
import wikipedia
import re
import requests
from bs4 import BeautifulSoup
import os
import pickle
from requests.exceptions import HTTPError
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
import tempfile
# Prefix shared by every hosted-inference model URL; the model id is appended.
base_url = "https://api-inference.huggingface.co/models/"

# Access token read from the environment; None when the variable is not set.
API_KEY = os.environ.get('HUGGINGFACE_API_KEY')

# Display name -> full inference URL for each selectable model.
model_links = {
    "StartupCaptain🥇": f"{base_url}mistralai/Mistral-7B-Instruct-v0.2",
}

# Display name -> sidebar metadata (markdown description and logo image path).
model_info = {
    "StartupCaptain🥇": {
        'description': (
            "The StartupCaptain model is a Large Language Model (LLM) that's able to "
            "predict the success potential of Indian startups based on various factors "
            "as a Sucess full startup Founder.\n \n\n"
            "This model can analyze startup data, including funding rounds, team "
            "experience, industry, market size, user growth, and more to provide "
            "insights into the startup's likelihood of success.\n"
        ),
        'logo': './captain.jpg',
    },
}
def format_prompt(startup_details):
    """Render a details mapping as the tagged text prompt sent to the model.

    Every entry becomes a ``Key: value`` line (key passed through
    ``str.capitalize``). The ``funding_rounds`` entry is expanded into one
    bullet per round, reading ``type`` and ``amount`` from each round mapping
    and substituting ``N/A`` for whichever is absent.

    Args:
        startup_details: Mapping of field name to value; ``funding_rounds``,
            when present, must be an iterable of mappings.

    Returns:
        The prompt text wrapped in ``[STARTUP_DETAILS]`` tags, followed by the
        closing instruction sentence.
    """
    pieces = ["[STARTUP_DETAILS]\n"]
    for field, content in startup_details.items():
        label = field.capitalize()
        if field != "funding_rounds":
            pieces.append(f"{label}: {content}\n")
            continue
        # Funding rounds get a heading line and then one bullet per round.
        pieces.append(f"{label}:\n")
        pieces.extend(
            f"- Type: {entry.get('type', 'N/A')}, Amount: {entry.get('amount', 'N/A')}\n"
            for entry in content
        )
    pieces.append("[/STARTUP_DETAILS]\n")
    pieces.append("Fill in any missing details and provide a comprehensive analysis.")
    return "".join(pieces)
def reset_conversation():
    """Empty the stored conversation and messages and mark the chat as "reset".

    Registered as the ``on_click`` callback of the sidebar "Reset Chat" button.
    """
    session = st.session_state
    session.conversation, session.messages = [], []
    session.chat_state = "reset"
def load_conversation_history():
    """Load the persisted chat history from ``conversation_history.pickle``.

    The file is looked up relative to the current working directory.

    Returns:
        The stored list of messages, or an empty list when the file is
        missing, empty, corrupt, or does not contain a list.
    """
    history_file = "conversation_history.pickle"
    try:
        with open(history_file, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable while the file is written exclusively by this
            # app; switch to JSON if the file could ever be user-supplied.
            conversation_history = pickle.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        return []
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError, ValueError):
        # A truncated or corrupt file (e.g. the process died mid-write) must
        # not prevent the app from starting; begin with an empty history.
        return []
    # Callers iterate the result as a list of message dicts.
    if not isinstance(conversation_history, list):
        return []
    return conversation_history
def save_conversation_history(conversation_history):
    """Pickle the given history into ``conversation_history.pickle``.

    The file is created (or overwritten) in the current working directory.

    Args:
        conversation_history: Picklable object holding the chat messages.
    """
    with open("conversation_history.pickle", "wb") as sink:
        sink.write(pickle.dumps(conversation_history))
# A standalone four-digit year in the range 1800-2099 (not part of a longer number).
_YEAR_PATTERN = r'(?<!\d)((?:1[89]|20)\d{2})(?!\d)'
# A year that follows a founding verb within the same sentence.
_FOUNDED_YEAR_RE = re.compile(
    r'\b(?:founded|established|incorporated)\b[^.]*?' + _YEAR_PATTERN,
    re.IGNORECASE,
)
_ANY_YEAR_RE = re.compile(_YEAR_PATTERN)
_LOCATION_RE = re.compile(r'headquartered in (.*?)[,.]', re.IGNORECASE)


def extract_details_from_summary(summary):
    """Pull the founding year and headquarters location out of a text summary.

    Args:
        summary: Free-text description of the company; may be empty or None.

    Returns:
        A dict with the keys ``founded_year``, ``location``, ``financial_size``,
        ``market_size`` and ``success_rate``. Only the first two are ever
        filled from the text; every value that could not be determined is the
        string ``"N/A"``.
    """
    details = {
        "founded_year": "N/A",
        "location": "N/A",
        "financial_size": "N/A",
        "market_size": "N/A",
        "success_rate": "N/A"
    }
    if not summary:
        return details

    # Prefer a year tied to a founding verb; otherwise take the first
    # standalone year. A bare \d{4} would also match the first four digits of
    # longer numbers such as head counts ("12000 employees" -> "1200").
    year_match = _FOUNDED_YEAR_RE.search(summary) or _ANY_YEAR_RE.search(summary)
    if year_match:
        details["founded_year"] = year_match.group(1)

    location_match = _LOCATION_RE.search(summary)
    if location_match:
        location = location_match.group(1).strip()
        if location:
            details["location"] = location
    return details
# Seconds to wait for each scraping request before giving up.
_SCRAPE_TIMEOUT_SECONDS = 10


def _profile_slug(startup_name):
    """Turn a company name into a URL-safe, dash-separated path segment."""
    from urllib.parse import quote

    return quote(startup_name.strip().replace(' ', '-'), safe='')


def _scrape_crunchbase(startup_name, startup_details):
    """Add name, founding year, industry and funding rounds from Crunchbase.

    Fields are written into ``startup_details`` as they are found so that a
    failure part-way through keeps whatever was already collected.
    """
    crunchbase_url = f"https://www.crunchbase.com/organization/{_profile_slug(startup_name)}"
    response = requests.get(crunchbase_url, timeout=_SCRAPE_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return
    soup = BeautifulSoup(response.content, "html.parser")
    startup_details["name"] = startup_name

    founded_year_elem = soup.select_one("div[data-field='founded_year'] span.component--field-formatter")
    if founded_year_elem is not None:
        founded_text = founded_year_elem.text.strip()
        try:
            startup_details["founded_year"] = int(founded_text)
        except ValueError:
            # Keep the raw text rather than abandoning the rest of the page.
            startup_details["founded_year"] = founded_text or "N/A"

    industry_elem = soup.select_one("div[data-field='industries'] span.component--field-formatter")
    if industry_elem is not None:
        startup_details["industry"] = industry_elem.text.strip()

    funding_rounds = []
    for round_elem in soup.select("div[data-field='funding_rounds'] ul li"):
        round_details = {}
        round_type = round_elem.select_one("span.component--field-formatter")
        if round_type is not None:
            round_details["type"] = round_type.text.strip()
        round_amount = round_elem.select_one("span.component--field-formatter + span")
        if round_amount is not None:
            round_details["amount"] = round_amount.text.strip()
        funding_rounds.append(round_details)
    startup_details["funding_rounds"] = funding_rounds


def _scrape_angellist(startup_name, startup_details):
    """Add team member names and the profile tagline from AngelList."""
    angellist_url = f"https://angel.co/company/{_profile_slug(startup_name)}"
    response = requests.get(angellist_url, timeout=_SCRAPE_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return
    soup = BeautifulSoup(response.content, "html.parser")

    team_members = []
    for member_elem in soup.select("div.team-member"):
        member_name = member_elem.select_one("div.name")
        if member_name is not None:
            team_members.append(member_name.text.strip())
    startup_details["team_members"] = team_members

    user_growth_elem = soup.select_one("div.profile-content-section div.section-tagline")
    if user_growth_elem is not None:
        startup_details["user_growth"] = user_growth_elem.text.strip()


def scrape_startup_info(startup_name):
    """Collect whatever public details can be found for a startup.

    Wikipedia is tried first. When it yields no summary, Crunchbase and
    AngelList are scraped instead; failures there are reported with
    ``st.error`` and do not stop the function.

    Args:
        startup_name: Company name as typed by the user.

    Returns:
        A dict that always contains ``name``, ``summary``, ``founded_year``,
        ``industry``, ``funding_rounds`` (list), ``team_members``,
        ``user_growth``, ``financial_size``, ``market_size``, ``success_rate``
        and ``location``. Unknown values are ``"N/A"`` (``[]`` for funding
        rounds). The result is therefore never empty, so a caller's
        "nothing found" check on its truthiness will not trigger.
    """
    startup_details = {}
    try:
        startup_summary = wikipedia.summary(startup_name, auto_suggest=False)
    except (wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError,
            ValueError,
            HTTPError,
            requests.exceptions.RequestException):
        # Best effort: an ambiguous/missing page or a network failure simply
        # means we fall through to the other sources below.
        startup_summary = None

    if startup_summary is not None:
        startup_details['name'] = startup_name
        startup_details['summary'] = startup_summary
        startup_details.update(extract_details_from_summary(startup_summary))

    if 'summary' not in startup_details:
        # NOTE(review): both scrapers are treated as fallbacks used only when
        # Wikipedia gave nothing — confirm that AngelList was not meant to run
        # unconditionally.
        fallback_sources = (
            ("Crunchbase", _scrape_crunchbase),
            ("AngelList", _scrape_angellist),
        )
        for source_name, scraper in fallback_sources:
            try:
                scraper(startup_name, startup_details)
            except Exception as e:
                st.error(f"Error scraping {source_name}: {e}")

    # Fill only what is still missing. Assigning unconditionally here would
    # throw away values already extracted above (e.g. the Wikipedia location).
    defaults = {
        'name': startup_name,
        'summary': "N/A",
        'founded_year': "N/A",
        'industry': "N/A",
        'funding_rounds': [],
        'team_members': "N/A",
        'user_growth': "N/A",
        'financial_size': "N/A",
        'market_size': "N/A",
        'success_rate': "N/A",
        'location': "N/A",
    }
    for key, default in defaults.items():
        startup_details.setdefault(key, default)
    return startup_details
def wrap_text(text, width):
    """Greedily wrap ``text`` so that lines hold at most ``width`` characters.

    Words are separated on any whitespace and re-joined with single spaces.
    A word longer than ``width`` is never split; it is placed on a line of
    its own.

    Args:
        text: The string to wrap.
        width: Maximum line length in characters.

    Returns:
        The wrapped lines joined with ``"\\n"``; an empty string when ``text``
        contains no words.
    """
    lines = []
    current_line = []
    current_length = 0  # length of current_line once joined with single spaces
    for word in text.split():
        if current_line:
            needed = current_length + 1 + len(word)
        else:
            needed = len(word)
        # Only flush a non-empty line: flushing an empty one (first word
        # longer than width) would emit a spurious leading blank line.
        if current_line and needed > width:
            lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length = needed
    if current_line:
        lines.append(' '.join(current_line))
    return "\n".join(lines)
def generate_pdf_report(startup_details):
    """Write a one-table PDF report for a startup and return the file path.

    Args:
        startup_details: Mapping with the fields produced by the scraper
            (``name``, ``summary``, ``financial_size``, ``market_size``,
            ``success_rate``, ``founded_year``, ``location``,
            ``funding_rounds``, ``team_members``, ``user_growth``). Missing
            fields are rendered as ``N/A``.

    Returns:
        Path of a newly created temporary ``.pdf`` file. The file is not
        deleted automatically; the caller owns it.
    """
    def field(key):
        # Cells are always text so numeric values (e.g. an int year) render.
        return str(startup_details.get(key, "N/A"))

    # Reserve a path and close the handle before ReportLab opens the same
    # path for writing; writing while it is still open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        pdf_path = tmp_file.name

    styles = getSampleStyleSheet()
    elements = [Paragraph("Startup Report", styles['Title'])]

    data = [
        ["Field", "Details"],
        ["Startup Name", field('name')],
        ["Summary", wrap_text(field('summary'), 50)],
        ["Financial Size", field('financial_size')],
        ["Market Size", field('market_size')],
        ["Success Rate", field('success_rate')],
        ["Founding Year", field('founded_year')],
        ["Location", field('location')],
        ["Funding Rounds", ""]
    ]

    funding_rounds = startup_details.get('funding_rounds') or []
    if isinstance(funding_rounds, list):
        for round_details in funding_rounds:
            data.append(["", f"- Type: {round_details.get('type', 'N/A')}, Amount: {round_details.get('amount', 'N/A')}"])

    team_members = startup_details.get('team_members', "N/A")
    if isinstance(team_members, list):
        team_members = ", ".join(str(member) for member in team_members)
    data += [
        ["Team Members", str(team_members)],
        ["User Growth", field('user_growth')]
    ]

    table = Table(data, colWidths=[150, 350])
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 12),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))
    elements.append(table)

    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    try:
        doc.build(elements)
    except Exception:
        # Do not leave an empty or partial temp file behind when layout fails.
        try:
            os.remove(pdf_path)
        except OSError:
            pass  # cleanup is best effort; the build error is what matters
        raise
    return pdf_path
# --- Sidebar: model picker, temperature slider, reset button, model card ---
models = list(model_links)
sidebar = st.sidebar
selected_model = sidebar.selectbox("Select Model", models)
temp_values = sidebar.slider('Select a temperature value', 0.0, 1.0, 0.5)
sidebar.button('Reset Chat', on_click=reset_conversation)  # Reset button
sidebar.write(f"You're now chatting with {selected_model}")
selected_info = model_info[selected_model]
sidebar.markdown(selected_info['description'])
sidebar.image(selected_info['logo'])

# --- Session state: seed the keys the rest of the script relies on ---
session = st.session_state
if "prev_option" not in session:
    session.prev_option = selected_model
if session.prev_option != selected_model:
    # Switching models starts a fresh message list.
    session.messages = []
    session.prev_option = selected_model
if "chat_state" not in session:
    session.chat_state = "normal"
if "messages" not in session:
    # The saved history is only read when the session has no messages yet.
    session.messages = load_conversation_history()
if "asked_questions" not in session:
    session.asked_questions = {}

repo_id = model_links[selected_model]
st.subheader(f'{selected_model}')
# Token budget requested from the model for every generation call.
MAX_NEW_TOKENS = 3000

# NOTE(review): this analysis brief was built in the prediction flow but never
# included in the text sent to the model. It is kept verbatim and still unused
# so that the prompt (and therefore the model output) does not change silently;
# decide whether it should be appended to the prompt.
CUSTOM_INSTRUCTION = "Based on the provided startup details or information and your knowledge of the industry, and as an experienced startup founder, provide a comprehensive analysis of the startup's potential for success. Discuss the industry outlook, future scope, and any other relevant factors that could contribute to the startup's success or failure. Provide a clear recommendation on whether the startup is likely to be successful or not."

# Captures the company name after "predict success of", up to the first "?"
# or the end of the line, so the trailing question mark is optional.
_PREDICT_PATTERN = re.compile(r'predict success of\s*([^?\n]+)', re.IGNORECASE)

# Label looked for in a response line -> startup_details key it fills.
_RESPONSE_FIELDS = (
    ('Financial Size', 'financial_size'),
    ('Market Size', 'market_size'),
    ('Success Rate', 'success_rate'),
    ('Location', 'location'),
)


def _generate_response(prompt_text):
    """Stream a completion for ``prompt_text`` and return it as one string.

    Uses the model selected in the sidebar and the sidebar temperature, and
    authenticates with ``API_KEY`` when it is set.
    """
    client = InferenceClient(model=model_links[selected_model], token=API_KEY)
    stream = client.text_generation(
        prompt_text,
        temperature=temp_values,
        max_new_tokens=MAX_NEW_TOKENS,
        stream=True
    )
    pieces = []
    for chunk in stream:
        if isinstance(chunk, dict) and "text" in chunk:
            pieces.append(chunk["text"])
        else:
            pieces.append(chunk)
    return "".join(pieces)


def _apply_response_details(startup_details, response):
    """Copy ``Label: value`` facts from the model response into the details.

    Lines that mention a label but carry no ``": "`` separator are ignored
    instead of raising, so free-form prose never discards the whole response.
    Funding rounds are only taken from the response when none are known yet.
    """
    lines = response.split('\n')
    for index, line in enumerate(lines):
        for label, key in _RESPONSE_FIELDS:
            if label in line:
                _, separator, value = line.partition(': ')
                if separator:
                    startup_details[key] = value.strip()
                break
        else:
            if 'Funding Rounds' in line and not startup_details.get('funding_rounds'):
                rounds = []
                # Bullets are read from the lines right after this heading;
                # the first non-bullet line ends the list.
                for funding_line in lines[index + 1:]:
                    if '- Type' not in funding_line:
                        break
                    type_match = re.search(r'Type: (.*?),', funding_line)
                    amount_match = re.search(r'Amount: (.*)', funding_line)
                    if type_match and amount_match:
                        rounds.append({
                            'type': type_match.group(1),
                            'amount': amount_match.group(1),
                        })
                startup_details['funding_rounds'] = rounds


def _report_generation_error(error):
    """Show the user-facing message for a failed generation attempt."""
    if isinstance(error, ValueError):
        if "Input validation error" in str(error):
            st.error("Error: The input prompt is too long. Please try a shorter prompt.")
        else:
            st.error(f"An error occurred: {error}")
    else:
        st.error(f"An unexpected error occurred: {error}")


def _run_chat_turn(prompt, question_key, model_prompt, startup_details=None, startup_name=None):
    """Echo the user prompt, generate the answer, render, cache and persist it.

    When ``startup_details`` is given the answer is treated as a success
    analysis: facts are parsed back into the details and a heading is added.
    """
    with st.chat_message("user"):
        st.markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("assistant"):
        try:
            response = _generate_response(model_prompt)
        except Exception as e:
            _report_generation_error(e)
            return
        if startup_details is None:
            st.markdown(response)
        else:
            _apply_response_details(startup_details, response)
            display_name = startup_details.get('name') or startup_name
            st.markdown(f"Success Analysis for {display_name}\n\n{response}")
        st.session_state.asked_questions[question_key] = response

    # Only successful answers are recorded and written to disk.
    st.session_state.messages.append({"role": "assistant", "content": response})
    save_conversation_history(st.session_state.messages)


def _render_report_section(input_key, button_key):
    """Render one "company name + Download Report" form and handle its click."""
    company_name = st.text_input('Enter the company name for the report:', key=input_key)
    clicked = st.button('Download Report', key=button_key)
    if not (clicked and company_name):
        return

    startup_details = scrape_startup_info(company_name)
    if not startup_details:
        st.error(f"No information found for the company '{company_name}'.")
        return

    try:
        response = _generate_response(format_prompt(startup_details))
        _apply_response_details(startup_details, response)
        pdf_path = generate_pdf_report(startup_details)
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()
        finally:
            # The report is served from memory, so the temp file can go now
            # rather than piling up for the lifetime of the host.
            try:
                os.remove(pdf_path)
            except OSError:
                pass  # best-effort cleanup
        st.download_button(
            label="Download PDF",
            data=pdf_bytes,
            file_name=f"{company_name}_report.pdf",
            mime="application/pdf"
        )
    except Exception as e:
        _report_generation_error(e)


if st.session_state.chat_state == "normal":
    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input(f"Hi I'm {selected_model}, How can I help you today?"):
        question_key = prompt.lower().strip()
        if question_key in st.session_state.asked_questions:
            # Same question as before: reuse the cached answer.
            st.markdown(st.session_state.asked_questions[question_key])
        elif "predict success of" in prompt.lower():
            name_match = _PREDICT_PATTERN.search(prompt)
            startup_name = name_match.group(1).strip() if name_match else ""
            if not startup_name:
                st.warning("Please name the startup, e.g. 'Predict success of Zomato?'")
            else:
                startup_details = scrape_startup_info(startup_name)
                if startup_details:
                    _run_chat_turn(
                        prompt,
                        question_key,
                        format_prompt(startup_details),
                        startup_details=startup_details,
                        startup_name=startup_name,
                    )
                else:
                    st.write(f"No information found for the startup '{startup_name}'. Please try another startup name or provide additional details.")
        else:
            _run_chat_turn(prompt, question_key, format_prompt({"question": prompt}))
elif st.session_state.chat_state == "reset":
    st.session_state.chat_state = "normal"
    # st.experimental_rerun was replaced by st.rerun; use whichever exists.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun
    _rerun()

# NOTE(review): the second form duplicates the first under different widget
# keys; both are kept so the page layout is unchanged.
_render_report_section('company_name_input', 'download_report_button')
_render_report_section('company_name_input2', 'download_report_button2')