|
|
from sumy.parsers.plaintext import PlaintextParser |
|
|
from sumy.nlp.tokenizers import Tokenizer |
|
|
from sumy.summarizers.text_rank import TextRankSummarizer |
|
|
from sumy.summarizers.lex_rank import LexRankSummarizer |
|
|
from sumy.summarizers.lsa import LsaSummarizer |
|
|
from sumy.summarizers.luhn import LuhnSummarizer |
|
|
import nltk |
|
|
from nltk.corpus import stopwords |
|
|
from retrying import retry |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
import tiktoken |
|
|
import os |
|
|
|
|
|
# Download the NLTK "punkt" and "stopwords" resources. These calls run every
# time the module is imported.
# NOTE(review): newer NLTK releases reportedly look up "punkt_tab" rather than
# "punkt" for sentence splitting - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
|
|
|
|
|
import json |
|
|
from exa_py import Exa |
|
|
from groq import Groq |
|
|
import httpx |
|
|
|
|
|
|
|
|
# Load variables from a local .env file so the API keys read below resolve.
load_dotenv()

# Exa search client; os.getenv yields None when EXA_API_KEY is not set.
exa = Exa(api_key=os.getenv("EXA_API_KEY"))

# Groq chat model requested by call_llm.
utilized_model = "llama3-70b-8192"
# Base summary budget; generate_data_from_docx_1 passes token_limit - 1050.
token_limit = 6000
# Not referenced elsewhere in the visible code; presumably the model whose
# tokenizer count_tokens should use - TODO confirm.
model_name = "gpt-3.5-turbo"

# Bind `client` up front so a failed initialisation leaves a well-defined
# None instead of an unbound name that surfaces later as a NameError.
client = None
try:
    http_client = httpx.Client()
    client = Groq(api_key=os.getenv("GROQ_API_KEY"), http_client=http_client)
    print("Groq client initialized successfully!")
except TypeError as e:
    print("Error initializing Groq client:", str(e))
except Exception as ex:
    print("Unexpected error:", str(ex))

# Passed as the `highlights` argument of exa.search_and_contents in call_llm.
highlights_options = {
    "num_sentences": 7,
    "highlights_per_url": 1,
}
|
|
|
|
|
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=5)
def call_llm(prompt):
    """Answer ``prompt`` with the Groq chat model, using Exa highlights as sources.

    The prompt is first used as an Exa search query (three results); the first
    highlight of each result is handed to the model as "Sources" next to the
    original question.

    Any exception raised inside this function triggers the ``retry``
    decorator: up to five attempts with exponential back-off.

    Args:
        prompt (str): The user question; also used verbatim as the search query.

    Returns:
        The message content of the first completion choice.
    """
    search_response = exa.search_and_contents(
        query=prompt,
        highlights=highlights_options,
        num_results=3,
        use_autoprompt=True,
    )
    # Skip results that carry no highlights: indexing them would raise, and
    # the retry decorator would then repeat the same failing call.
    info = [sr.highlights[0] for sr in search_response.results if sr.highlights]

    system_prompt = "You are a business proposal assistant. Generate detailed and precise responses to the user's query."
    user_prompt = f"Sources: {info}\nQuestion: {prompt}"

    completion = client.chat.completions.create(
        model=utilized_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content
|
|
|
|
|
def summarize_content(content, token_limit):
    """Return the longest two-sentence extractive summary found in ``content``.

    The text is cut into 1000-character chunks. Each chunk is summarised to
    two sentences by the TextRank, LexRank, LSA and Luhn summarizers, and the
    candidate with the most words that still fits ``token_limit`` wins (the
    first one wins on ties). Only that single candidate is returned; the
    other chunks are not merged into the result.

    Args:
        content (str): The text to be summarized.
        token_limit (int): Maximum size of the summary. It is measured in
            whitespace-separated words, which only approximates model tokens.

    Returns:
        str: The selected summary, or "" when ``content`` is empty or blank,
        ``token_limit`` is not positive, or no candidate fits.
    """
    # Nothing to summarise, or no room for a summary: answer before touching
    # the summarizers at all.
    if not content or not content.strip() or token_limit <= 0:
        return ""

    summarizers = (
        TextRankSummarizer(),
        LexRankSummarizer(),
        LsaSummarizer(),
        LuhnSummarizer(),
    )
    tokenizer = Tokenizer("english")
    chunk_size = 1000
    sentences_per_summary = 2

    best_summary = ""
    best_word_count = 0

    for start in range(0, len(content), chunk_size):
        chunk = content[start:start + chunk_size]
        document = PlaintextParser.from_string(chunk, tokenizer).document

        for summarizer in summarizers:
            sentences = summarizer(document, sentences_per_summary)
            # Join the text of each sentence; str() on the returned sequence
            # itself would yield its repr rather than readable text.
            candidate = " ".join(str(sentence) for sentence in sentences).strip()
            word_count = len(candidate.split())

            # Strict comparison keeps the first of equally long candidates.
            if best_word_count < word_count <= token_limit:
                best_summary = candidate
                best_word_count = word_count

    return best_summary
|
|
import re |
|
|
from docx import Document |
|
|
import streamlit as st |
|
|
|
|
|
def count_tokens(text, model_name):
    """Return the number of tiktoken tokens in ``text``.

    Args:
        text (str): The text to tokenize.
        model_name (str): Either a model identifier (e.g. "gpt-3.5-turbo") or
            a tiktoken encoding name (e.g. "cl100k_base").

    Returns:
        int: Number of tokens produced by the resolved encoding.

    Raises:
        ValueError: If ``model_name`` is neither a known model nor a known
            encoding name.
    """
    try:
        # Model identifiers must be mapped to their encoding first;
        # get_encoding only understands encoding names.
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Not a known model: keep supporting callers that pass an encoding name.
        encoding = tiktoken.get_encoding(model_name)

    return len(encoding.encode(text))
|
|
|
|
|
|
|
|
def create_extraction_prompt(content):
    """Build the LLM prompt that asks for 25 business fields as a JSON object.

    The prompt lists the fields as a numbered list, embeds ``content``
    verbatim, and ends with a JSON template. The template uses double-quoted
    keys so that a model mirroring it produces output ``json.loads`` accepts.

    Args:
        content (str): The summarized document text to extract from.

    Returns:
        str: The complete prompt text.
    """
    # (human-readable label, JSON key) in the order they are presented.
    fields = (
        ("Company Name", "company_name"),
        ("Industry", "industry"),
        ("Location", "location"),
        ("Mission", "mission"),
        ("Vision", "vision"),
        ("Products/Services", "products_services"),
        ("Target Market", "target_market"),
        ("Value Proposition", "value_proposition"),
        ("Current Revenue", "current_revenue"),
        ("Current Expenses", "current_expenses"),
        ("Funding Requirements", "funding_requirements"),
        ("Management Team", "management_team"),
        ("Company Structure", "company_structure"),
        ("Goals/Objectives", "goals_objectives"),
        ("Operational Strategy", "operational_strategy"),
        ("Market Overview", "market_overview"),
        ("Promotional Strategy", "promotional_strategy"),
        ("Current Status", "current_status"),
        ("Existing Company Details", "existing_company_details"),
        ("Accelerator Participation", "accelerator_participation"),
        ("Finland Contacts", "finland_contacts"),
        ("Reason for Finland Participation", "reason_for_finland"),
        ("First Year Plan", "first_year_plan"),
        ("Funding Plan", "funding_plan"),
        ("Additional Information", "additional_information"),
    )

    numbered_list = "".join(
        f"{number}. {label}\n"
        for number, (label, _) in enumerate(fields, start=1)
    )
    json_template = ",\n".join(f'  "{key}": <value>' for _, key in fields)

    # Plain concatenation (not str.format) so braces inside `content` are
    # passed through untouched.
    return (
        "You are an expert business consultant. Please extract the following information from the provided content:\n\n"
        + numbered_list
        + "\n"
        + f"Here is the summarized content:\n{content}\n\n"
        + "Please provide the extracted information in JSON format, structured as follows:\n"
        + "{\n"
        + json_template
        + "\n}\n"
        + "Return only the JSON object, with double-quoted keys and string values and no additional text.\n"
    )
|
|
|
|
|
def generate_data_from_docx_1(file):
    """Generate data dictionary from uploaded DOCX file.

    Reads the non-empty paragraphs of the document, summarizes them with a
    budget of ``token_limit - 1050``, asks the LLM to extract the business
    fields as JSON and fills in a default for every missing field.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: One entry per expected field. An empty dict is returned (and an
        error shown in the Streamlit page) when the LLM answer is not a JSON
        object, matching the behaviour of ``generate_data_from_docx_2_1``.
    """
    # Expected keys with the fallback used when the LLM omits them.
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }

    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    content = summarize_content(content, token_limit - 1050)
    prompt = create_extraction_prompt(content)
    extracted_data_json = call_llm(prompt)

    st.subheader("Data Extraction")

    try:
        extracted_data = json.loads(extracted_data_json)
    except (json.JSONDecodeError, TypeError):
        # The model answered with something other than JSON text (or nothing).
        st.error("Failed to parse JSON response from LLM.")
        return {}

    st.write(extracted_data)

    if not isinstance(extracted_data, dict):
        st.error("Expected a dictionary for the extracted data.")
        return {}

    return {key: extracted_data.get(key, fallback) for key, fallback in defaults.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
from typing import Dict, Any |
|
|
|
|
|
def generate_data_from_docx(file):
    """Generate data dictionary from uploaded DOCX file.

    Reads the non-empty paragraphs of the document, summarizes them with a
    budget of 650, sends the extraction prompt to the LLM and then pulls each
    field out of the raw answer with a tolerant ``key: value`` line match, so
    the answer does not have to be strictly valid JSON.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: One entry per expected field; a field that cannot be found maps
        to ``"No <field> information found"``. An empty dict is returned when
        the LLM answer is not text.
    """
    fields = (
        "company_name",
        "industry",
        "location",
        "mission",
        "vision",
        "products_services",
        "target_market",
        "value_proposition",
        "current_revenue",
        "current_expenses",
        "funding_requirements",
        "management_team",
        "company_structure",
        "goals_objectives",
        "operational_strategy",
        "market_overview",
        "promotional_strategy",
        "current_status",
        "existing_company_details",
        "accelerator_participation",
        "finland_contacts",
        "reason_for_finland",
        "first_year_plan",
        "funding_plan",
        "additional_information",
    )

    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    content = summarize_content(content, token_limit=650)
    prompt = create_extraction_prompt(content)
    response = call_llm(f"{prompt}")
    st.subheader("data extraction")
    st.write(response)

    if not isinstance(response, str):
        print(f"Error extracting fields: expected text from the LLM, got {type(response).__name__}")
        return {}

    def extract_field(field: str) -> str:
        """Return the value on the first ``field: value`` line of the response."""
        default = f"No {field} information found"
        # The key may be wrapped in single or double quotes (JSON-style
        # answers); the look-behind stops "vision" matching inside a longer
        # word. Only the remainder of that line is captured.
        pattern = rf"""(?<![A-Za-z0-9_])["']?{re.escape(field)}["']?\s*:\s*(.+)"""
        match = re.search(pattern, response, re.IGNORECASE)
        if not match:
            return default

        # Drop the JSON field separator and any surrounding quotes.
        value = match.group(1).strip().rstrip(",").strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        return value or default

    return {field: extract_field(field) for field in fields}
|
|
|
|
|
|
|
|
|
|
|
def generate_data_from_docx_2_1(file):
    """Generate data dictionary from uploaded DOCX file.

    Reads the non-empty paragraphs of the document, summarizes them with a
    budget of 650, asks the LLM to extract the business fields as JSON and
    fills in a default for every missing field.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: One entry per expected field, or an empty dict (with an error
        shown in the Streamlit page) when the LLM answer cannot be parsed as a
        JSON object.
    """
    # Expected keys with the fallback used when the LLM omits them.
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }

    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    content = summarize_content(content, token_limit=650)
    prompt = create_extraction_prompt(content)
    extracted_data_json = call_llm(prompt)

    st.subheader("Data Extraction")

    # Only the parse can fail; the default lookups below cannot raise.
    try:
        extracted_data = json.loads(extracted_data_json)
    except json.JSONDecodeError:
        st.error("Failed to parse JSON response from LLM.")
        return {}
    except Exception as e:
        st.error(f"An error occurred while extracting fields: {str(e)}")
        return {}

    if not isinstance(extracted_data, dict):
        st.error("Expected a dictionary for the extracted data.")
        return {}

    return {key: extracted_data.get(key, fallback) for key, fallback in defaults.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Manual smoke test. Guarded so that importing this module no longer runs the
# summarizers and prints to stdout as a side effect.
if __name__ == "__main__":
    sample_text = "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of 'intelligent agents': any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals."

    # summarize_content already picks the best candidate across all of its
    # summarizers, so one call with an accurate label is enough.
    print(f"Best extractive summary (token limit: {token_limit}):")
    print(summarize_content(sample_text, token_limit))
|
|
|
|
|
|