Bussiness-plan-17-Question-Answerer-2-test

Build error

App Files Files Community

Bussiness-plan-17-Question-Answerer-2-test / sumerize.py

SoDa12321

Update sumerize.py

6b0d9aa verified about 1 year ago

raw

history blame contribute delete

20.2 kB

	from sumy.parsers.plaintext import PlaintextParser
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.summarizers.text_rank import TextRankSummarizer
	from sumy.summarizers.lex_rank import LexRankSummarizer
	from sumy.summarizers.lsa import LsaSummarizer
	from sumy.summarizers.luhn import LuhnSummarizer
	import nltk
	from nltk.corpus import stopwords
	from retrying import retry
	from dotenv import load_dotenv

	import tiktoken
	import os

	# Fetch NLTK data at import time (runs on every import of this module).
	# 'stopwords' backs the nltk.corpus.stopwords import above; 'punkt' is
	# presumably what sumy's Tokenizer("english") needs for sentence splitting
	# -- TODO confirm against the installed NLTK/sumy versions.
	nltk.download('punkt')
	nltk.download('stopwords')

	import json
	from exa_py import Exa
	from groq import Groq
	import httpx

	# Load environment variables
	# (must run before the os.getenv lookups below so the API keys are visible)
	load_dotenv()

	# Initialize APIs
	# Exa search client; call_llm uses it to fetch source highlights for a prompt.
	exa = Exa(api_key=os.getenv("EXA_API_KEY"))
	#client = Groq(api_key=os.getenv("GROQ_API_KEY"))
	# Model identifier passed to the Groq chat completion in call_llm.
	utilized_model = "llama3-70b-8192"
	# Budget handed to summarize_content by generate_data_from_docx_1 and by the
	# example at the bottom of the file.
	token_limit = 6000
	# Not referenced elsewhere in this file; presumably meant as the model_name
	# argument of count_tokens -- TODO confirm.
	model_name = "gpt-3.5-turbo"

	# Bind `client` up front: the handlers below only report initialization
	# failures, so without this fallback the name would not exist at all and
	# the first use in call_llm would die with a NameError.
	client = None
	try:
		# Use a custom HTTP client
		http_client = httpx.Client()
		client = Groq(api_key=os.getenv("GROQ_API_KEY"), http_client=http_client)
		print("Groq client initialized successfully!")
	except TypeError as e:
		# Reported separately from other failures (e.g. an unexpected keyword
		# argument to the Groq constructor).
		print("Error initializing Groq client:", str(e))
	except Exception as ex:
		print("Unexpected error:", str(ex))

	# Passed as `highlights=` to exa.search_and_contents in call_llm, which reads
	# only the first highlight of each result. Presumably: sentences per
	# highlight and highlights per result URL -- TODO confirm against Exa docs.
	highlights_options = {
	"num_sentences": 7,
	"highlights_per_url": 1,
	}

	@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=5)
	def call_llm(prompt):
		"""Send the prompt to the LLM and return the response.

		The prompt is first used as an Exa search query (3 results); the first
		highlight of each result is passed to the model as "Sources" next to
		the question itself. Any exception makes the ``retry`` decorator run
		the whole function again, up to 5 attempts.

		Args:
			prompt (str): The question; used both as search query and user message.

		Returns:
			str: Content of the first choice of the chat completion.
		"""
		search_response = exa.search_and_contents(
			query=prompt,
			highlights=highlights_options,
			num_results=3,
			use_autoprompt=True,
		)
		# A result may come back without highlights; indexing [0] on it would
		# raise and only trigger pointless retries, so such results are skipped.
		info = [sr.highlights[0] for sr in search_response.results if sr.highlights]

		system_prompt = "You are a business proposal assistant. Generate detailed and precise responses to the user's query."
		user_prompt = f"Sources: {info}\nQuestion: {prompt}"

		completion = client.chat.completions.create(
			model=utilized_model,
			messages=[
				{"role": "system", "content": system_prompt},
				{"role": "user", "content": user_prompt},
			],
		)
		return completion.choices[0].message.content

	def summarize_content(content, token_limit):
		"""
		Summarize the given content and return the longest candidate summary.

		The text is cut into 1000-character chunks. Each chunk is summarized to
		two sentences by TextRank, LexRank, LSA and Luhn, and the candidate
		with the most words across all chunks and techniques is returned.

		Args:
			content (str): The text to be summarized.
			token_limit (int): Token budget for the summary. A non-positive
				budget yields an empty summary. NOTE: the winning candidate is
				not truncated to this budget; it is at most two sentences
				taken from a single chunk.

		Returns:
			str: A summary of the content ("" for empty content).
		"""
		# Nothing to summarize, or no room for a summary. This guard also
		# replaces the original score division, which failed on token_limit == 0.
		if not content or token_limit <= 0:
			return ""

		# Get stop words and hand them to every summarizer; previously they
		# were computed but never applied.
		stop_words = set(stopwords.words('english'))
		summarizers = (
			TextRankSummarizer(),
			LexRankSummarizer(),
			LsaSummarizer(),
			LuhnSummarizer(),
		)
		for summarizer in summarizers:
			summarizer.stop_words = stop_words

		best_summary = ""
		best_word_count = 0
		tokenizer = Tokenizer("english")

		# Split content into chunks
		chunk_size = 1000 # Adjust based on your needs
		for start in range(0, len(content), chunk_size):
			chunk = content[start:start + chunk_size]
			document = PlaintextParser.from_string(chunk, tokenizer).document

			for summarizer in summarizers:
				sentences = summarizer(document, 2)

				# The summarizer returns Sentence objects; join their text
				# instead of stringifying the containing tuple.
				summary = " ".join(str(sentence) for sentence in sentences)
				cleaned_summary = summary.replace('<html><head></head><body>Y4:0', '').replace('</body></html>', '')

				# Dividing by the constant token_limit never changed the
				# ranking, so the word count is compared directly.
				word_count = len(cleaned_summary.split())
				if word_count > best_word_count:
					best_summary = cleaned_summary
					best_word_count = word_count

		return best_summary
	import re
	from docx import Document
	import streamlit as st

	def count_tokens(text, model_name):
		"""Return the number of tokens ``text`` encodes to.

		Args:
			text (str): Text to tokenize.
			model_name (str): A model name such as "gpt-3.5-turbo", or a raw
				tiktoken encoding name such as "cl100k_base".

		Returns:
			int: Number of tokens in ``text``.

		Raises:
			ValueError: If ``model_name`` is neither a known model nor a known
				encoding name.
		"""
		try:
			# Model names must be resolved with encoding_for_model;
			# get_encoding only understands encoding names.
			encoding = tiktoken.encoding_for_model(model_name)
		except KeyError:
			# Not a known model: keep accepting raw encoding names as before.
			encoding = tiktoken.get_encoding(model_name)

		return len(encoding.encode(text))


	def create_extraction_prompt(content):
		"""Create a prompt for extracting specific variables from the summarized content.

		The prompt lists the 25 fields as a numbered checklist, embeds
		``content``, and ends with a JSON skeleton showing the key expected for
		each field.

		Args:
			content (str): Summarized document text to embed in the prompt.

		Returns:
			str: The complete prompt text.
		"""
		# (label shown in the numbered list, key used in the JSON skeleton)
		fields = (
			("Company Name", "company_name"),
			("Industry", "industry"),
			("Location", "location"),
			("Mission", "mission"),
			("Vision", "vision"),
			("Products/Services", "products_services"),
			("Target Market", "target_market"),
			("Value Proposition", "value_proposition"),
			("Current Revenue", "current_revenue"),
			("Current Expenses", "current_expenses"),
			("Funding Requirements", "funding_requirements"),
			("Management Team", "management_team"),
			("Company Structure", "company_structure"),
			("Goals/Objectives", "goals_objectives"),
			("Operational Strategy", "operational_strategy"),
			("Market Overview", "market_overview"),
			("Promotional Strategy", "promotional_strategy"),
			("Current Status", "current_status"),
			("Existing Company Details", "existing_company_details"),
			("Accelerator Participation", "accelerator_participation"),
			("Finland Contacts", "finland_contacts"),
			("Reason for Finland Participation", "reason_for_finland"),
			("First Year Plan", "first_year_plan"),
			("Funding Plan", "funding_plan"),
			("Additional Information", "additional_information"),
		)

		numbered_list = "".join(
			f"{number}. {label}\n"
			for number, (label, _) in enumerate(fields, start=1)
		)
		# Keys are double-quoted: JSON does not allow single-quoted strings, so
		# a reply mirroring a single-quoted skeleton could not be parsed with
		# json.loads by the callers.
		json_skeleton = ",\n".join(f' "{key}": <value>' for _, key in fields)

		return (
			"You are an expert business consultant. Please extract the following information from the provided content:\n\n"
			f"{numbered_list}\n"
			f"Here is the summarized content:\n{content}\n\n"
			"Please provide the extracted information in JSON format, structured as follows:\n"
			"{\n"
			f"{json_skeleton}\n"
			"}\n"
		)

	def generate_data_from_docx_1(file):
		"""Generate data dictionary from uploaded DOCX file.

		Reads the non-empty paragraphs of the document, summarizes them, asks
		the LLM to extract the business-plan fields as JSON, and shows the
		parsed reply in the Streamlit page.

		Args:
			file: Path or file-like object accepted by ``docx.Document``.

		Returns:
			dict: One entry per expected field, using the fallback text below
			for keys missing from the reply. An empty dict when the reply is
			not valid JSON or not a JSON object (an error is shown in the UI).
		"""
		# Expected keys and the value used when the LLM reply omits them.
		defaults = {
			"company_name": "Unknown Company",
			"industry": "Unknown Industry",
			"location": "Unknown Location",
			"mission": "No mission statement found",
			"vision": "No vision statement found",
			"products_services": "No products/services description found",
			"target_market": "No target market description found",
			"value_proposition": "No competitive advantage description found",
			"current_revenue": "$0.00",
			"current_expenses": "$0.00",
			"funding_requirements": "No funding requirements found",
			"management_team": "No management team information found",
			"company_structure": "No company structure information found",
			"goals_objectives": "No goals and objectives found",
			"operational_strategy": "No operational strategy found",
			"market_overview": "No market overview found",
			"promotional_strategy": "No promotional strategy found",
			"current_status": "Unknown Current Status",
			"existing_company_details": "No existing company details found",
			"accelerator_participation": "Unknown Accelerator Participation",
			"finland_contacts": "No Finland contacts information found",
			"reason_for_finland": "Unknown Reason for Finland Participation",
			"first_year_plan": "No first year plan information found",
			"funding_plan": "No funding plan information found",
			"additional_information": "No additional information found",
		}

		doc = Document(file)
		content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())

		# Summarize content, leaving 1050 tokens of the budget for the prompt
		content = summarize_content(content, token_limit-1050)

		# Create extraction prompt
		prompt = create_extraction_prompt(content)

		# Call LLM to extract data
		extracted_data_json = call_llm(prompt)

		# Display extracted data
		st.subheader("Data Extraction")

		try:
			extracted_data = json.loads(extracted_data_json)
		except (json.JSONDecodeError, TypeError):
			# Malformed JSON, or no text at all (None) came back from the LLM.
			st.error("Failed to parse JSON response from LLM.")
			return {}

		st.write(extracted_data)

		# Valid JSON that is not an object (list, string, ...) has no fields.
		if not isinstance(extracted_data, dict):
			st.error("Expected a dictionary for the extracted data.")
			return {}

		return {key: extracted_data.get(key, default) for key, default in defaults.items()}
	import re
	from typing import Dict, Any

	def generate_data_from_docx(file):
		"""Generate data dictionary from uploaded DOCX file.

		Reads the non-empty paragraphs of the document, summarizes them, asks
		the LLM to extract the business-plan fields, shows the raw reply in the
		Streamlit page, and pulls each field out of the reply with a regex.

		Args:
			file: Path or file-like object accepted by ``docx.Document``.

		Returns:
			dict: One string per expected field; "No <field> information found"
			when the field is absent from the reply. An empty dict if the
			extraction itself fails.
		"""
		field_names = (
			"company_name",
			"industry",
			"location",
			"mission",
			"vision",
			"products_services",
			"target_market",
			"value_proposition",
			"current_revenue",
			"current_expenses",
			"funding_requirements",
			"management_team",
			"company_structure",
			"goals_objectives",
			"operational_strategy",
			"market_overview",
			"promotional_strategy",
			"current_status",
			"existing_company_details",
			"accelerator_participation",
			"finland_contacts",
			"reason_for_finland",
			"first_year_plan",
			"funding_plan",
			"additional_information",
		)

		doc = Document(file)
		document_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())

		summary = summarize_content(document_text, token_limit=650)
		prompt = create_extraction_prompt(summary)
		llm_response = call_llm(prompt)
		st.subheader("data extraction")
		st.write(llm_response)

		def extract_field(field: str) -> str:
			"""Return the text following ``field:`` on its line in the LLM reply."""
			default = f"No {field} information found"
			# The prompt asks for JSON, so the key is usually wrapped in
			# quotes; accept both `field: value` and `"field": "value",`.
			optional_quote = "[\"']?"
			pattern = optional_quote + re.escape(field) + optional_quote + r"\s*:\s*(.+)"
			match = re.search(pattern, llm_response, re.IGNORECASE)
			if not match:
				return default
			# Drop the JSON trailing comma and the quotes around the value.
			value = match.group(1).strip().rstrip(",").strip().strip("\"'")
			return value or default

		try:
			return {name: extract_field(name) for name in field_names}
		except Exception as e:
			# e.g. the LLM reply was not a string
			print(f"Error extracting fields: {str(e)}")
			return {}



	def generate_data_from_docx_2_1(file):
		"""Build the business-plan data dictionary from an uploaded DOCX file.

		The document's non-empty paragraphs are summarized (token_limit=650),
		sent to the LLM with the extraction prompt, and the reply is parsed as
		JSON.

		Returns:
			dict: One entry per expected field, with the fallback text below
			for keys missing from the reply. An empty dict, after an error is
			shown in the Streamlit page, when the reply is not valid JSON, is
			not a JSON object, or any other error occurs.
		"""
		# Expected keys, in output order, with the value used when one is missing.
		fallbacks = {
			"company_name": "Unknown Company",
			"industry": "Unknown Industry",
			"location": "Unknown Location",
			"mission": "No mission statement found",
			"vision": "No vision statement found",
			"products_services": "No products/services description found",
			"target_market": "No target market description found",
			"value_proposition": "No competitive advantage description found",
			"current_revenue": "$0.00",
			"current_expenses": "$0.00",
			"funding_requirements": "No funding requirements found",
			"management_team": "No management team information found",
			"company_structure": "No company structure information found",
			"goals_objectives": "No goals and objectives found",
			"operational_strategy": "No operational strategy found",
			"market_overview": "No market overview found",
			"promotional_strategy": "No promotional strategy found",
			"current_status": "Unknown Current Status",
			"existing_company_details": "No existing company details found",
			"accelerator_participation": "Unknown Accelerator Participation",
			"finland_contacts": "No Finland contacts information found",
			"reason_for_finland": "Unknown Reason for Finland Participation",
			"first_year_plan": "No first year plan information found",
			"funding_plan": "No funding plan information found",
			"additional_information": "No additional information found",
		}

		# Document text -> summary -> extraction prompt -> raw LLM reply
		paragraphs = [para.text for para in Document(file).paragraphs if para.text.strip()]
		summary = summarize_content("\n".join(paragraphs), token_limit=650)
		llm_reply = call_llm(create_extraction_prompt(summary))

		st.subheader("Data Extraction")

		try:
			parsed = json.loads(llm_reply)

			# Guard clause: only a JSON object carries named fields.
			if not isinstance(parsed, dict):
				st.error("Expected a dictionary for the extracted data.")
				return {}

			return {key: parsed.get(key, fallback) for key, fallback in fallbacks.items()}
		except json.JSONDecodeError:
			st.error("Failed to parse JSON response from LLM.")
			return {}
		except Exception as e:
			st.error(f"An error occurred while extracting fields: {str(e)}")
			return {}
	# Example usage:
	# file_path = "path/to/your/docx/file.docx"
	# data = generate_data_from_docx(file_path)
	# print(data)


	# Example usage
	# NOTE: this runs at import time, so every import of this module performs
	# the summarization below and prints its result.
	content = "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of 'intelligent agents': any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals."

	# summarize_content tries every technique itself and keeps the best
	# candidate, so one call covers them all; the label reports the limit that
	# is actually passed instead of a hard-coded number.
	print(f"Best summary (Token Limit: {token_limit}):")
	print(summarize_content(content, token_limit))