# sumerize.py — summarization and DOCX business-plan extraction helpers
# (upstream commit 6b0d9aa by SoDa12321)
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
import nltk
from nltk.corpus import stopwords
from retrying import retry
from dotenv import load_dotenv
import tiktoken
import os
# Download the NLTK resources needed by sumy's English tokenizer ('punkt')
# and the stop-word list referenced in summarize_content ('stopwords').
nltk.download('punkt')
nltk.download('stopwords')
import json
from exa_py import Exa
from groq import Groq
import httpx

# Load environment variables
load_dotenv()

# Initialize APIs
# Exa web-search client; expects EXA_API_KEY in the environment/.env.
exa = Exa(api_key=os.getenv("EXA_API_KEY"))
#client = Groq(api_key=os.getenv("GROQ_API_KEY"))
utilized_model = "llama3-70b-8192"  # Groq chat model used by call_llm
token_limit = 6000  # summary size budget passed to summarize_content (word-count based — TODO confirm intended unit)
model_name = "gpt-3.5-turbo"  # model name intended for tiktoken in count_tokens
try:
    # Use a custom HTTP client
    http_client = httpx.Client()
    client = Groq(api_key=os.getenv("GROQ_API_KEY"), http_client=http_client)
    print("Groq client initialized successfully!")
except TypeError as e:
    print("Error initializing Groq client:", str(e))
except Exception as ex:
    print("Unexpected error:", str(ex))

# Exa highlight settings: one 7-sentence highlight per result URL.
highlights_options = {
    "num_sentences": 7,
    "highlights_per_url": 1,
}
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000, stop_max_attempt_number=5)
def call_llm(prompt):
    """Answer *prompt* with the Groq LLM, grounded on Exa web-search highlights.

    Args:
        prompt (str): The user's question; also used as the Exa search query.

    Returns:
        str: The LLM completion text.

    Retries with exponential backoff (1s..10s cap, max 5 attempts) when the
    Exa or Groq calls raise.
    """
    search_response = exa.search_and_contents(
        query=prompt,
        highlights=highlights_options,
        num_results=3,
        use_autoprompt=True,
    )
    # Guard against results that come back without highlights; indexing [0]
    # unconditionally (as before) raises IndexError on an empty list.
    info = [sr.highlights[0] for sr in search_response.results if sr.highlights]
    system_prompt = "You are a business proposal assistant. Generate detailed and precise responses to the user's query."
    user_prompt = f"Sources: {info}\nQuestion: {prompt}"
    completion = client.chat.completions.create(
        model=utilized_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    )
    return completion.choices[0].message.content
def summarize_content(content, token_limit):
    """
    Summarize the given content by trying several extractive summarizers on
    1000-character chunks and keeping the longest summary that fits within
    the token limit (approximated by word count).

    Args:
        content (str): The text to be summarized.
        token_limit (int): Maximum number of tokens allowed in the summary.

    Returns:
        str: The selected summary ("" if nothing fits or content is empty).
    """
    summarizers = {
        'text_rank': TextRankSummarizer(),
        'lex_rank': LexRankSummarizer(),
        'lsa': LsaSummarizer(),
        'luhn': LuhnSummarizer(),
    }
    best_summary = ""
    best_word_count = 0
    # Split content into chunks
    chunk_size = 1000  # characters; adjust based on your needs
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    for chunk in chunks:
        parser_chunk = PlaintextParser.from_string(chunk, Tokenizer("english"))
        for summarizer in summarizers.values():
            # Each summarizer returns a tuple of sumy Sentence objects.
            sentences = summarizer(parser_chunk.document, 2)
            # Join sentences as plain text. The previous str(summary) produced
            # tuple-repr garbage like "(<Sentence: ...>,)", and its
            # "<html>..." replacements never matched anything.
            candidate = " ".join(str(sentence) for sentence in sentences)
            word_count = len(candidate.split())
            # Prefer the longest candidate that still respects token_limit;
            # the old score (words / token_limit) never enforced the cap.
            if best_word_count < word_count <= token_limit:
                best_summary = candidate
                best_word_count = word_count
    return best_summary
import re
from docx import Document
import streamlit as st
def count_tokens(text, model_name):
    """Count the tiktoken tokens in *text* for the given OpenAI model.

    Args:
        text (str): Text to tokenize.
        model_name (str): An OpenAI model name such as "gpt-3.5-turbo".

    Returns:
        int: Number of tokens produced by the model's encoding.
    """
    # Bug fix: tiktoken.get_encoding() expects an *encoding* name such as
    # "cl100k_base" and raises ValueError for a model name like
    # "gpt-3.5-turbo"; encoding_for_model() maps model name -> encoding.
    encoding = tiktoken.encoding_for_model(model_name)
    # Encode the text and count tokens
    return len(encoding.encode(text))
def create_extraction_prompt(content):
    """Create a prompt asking the LLM to extract 25 business variables as JSON.

    Args:
        content (str): Summarized document text to embed in the prompt.

    Returns:
        str: The full extraction prompt, ending with a JSON skeleton the LLM
        should fill in.
    """
    # Single canonical (display label, JSON key) list keeps the numbered
    # request section and the JSON skeleton in sync.
    fields = [
        ("Company Name", "company_name"),
        ("Industry", "industry"),
        ("Location", "location"),
        ("Mission", "mission"),
        ("Vision", "vision"),
        ("Products/Services", "products_services"),
        ("Target Market", "target_market"),
        ("Value Proposition", "value_proposition"),
        ("Current Revenue", "current_revenue"),
        ("Current Expenses", "current_expenses"),
        ("Funding Requirements", "funding_requirements"),
        ("Management Team", "management_team"),
        ("Company Structure", "company_structure"),
        ("Goals/Objectives", "goals_objectives"),
        ("Operational Strategy", "operational_strategy"),
        ("Market Overview", "market_overview"),
        ("Promotional Strategy", "promotional_strategy"),
        ("Current Status", "current_status"),
        ("Existing Company Details", "existing_company_details"),
        ("Accelerator Participation", "accelerator_participation"),
        ("Finland Contacts", "finland_contacts"),
        ("Reason for Finland Participation", "reason_for_finland"),
        ("First Year Plan", "first_year_plan"),
        ("Funding Plan", "funding_plan"),
        ("Additional Information", "additional_information"),
    ]
    numbered = "\n".join(f"{i}. {label}" for i, (label, _) in enumerate(fields, start=1))
    # Bug fix: the template previously showed single-quoted keys, which is not
    # valid JSON and steered the LLM into output that json.loads() rejects.
    skeleton = ",\n".join(f'    "{key}": <value>' for _, key in fields)
    prompt = (
        "You are an expert business consultant. Please extract the following information from the provided content:\n\n"
        + numbered + "\n\n"
        + f"Here is the summarized content:\n{content}\n\n"
        + "Please provide the extracted information in JSON format, structured as follows:\n"
        + "{\n" + skeleton + "\n}\n"
    )
    return prompt
def generate_data_from_docx_1(file):
    """Generate a data dictionary of business-plan fields from a DOCX file.

    Args:
        file: Path or file-like object accepted by docx.Document.

    Returns:
        dict: Field key -> extracted value (with a default for anything the
        LLM omitted), or {} when the LLM response is not valid JSON.
    """
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    # Summarize content if necessary (leave headroom under the model budget).
    content = summarize_content(content, token_limit - 1050)
    # Create extraction prompt
    prompt = create_extraction_prompt(content)
    # Call LLM to extract data
    extracted_data_json = call_llm(prompt)
    # Display extracted data
    st.subheader("Data Extraction")
    # Default values used when the LLM omits a field.
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }
    # Bug fix: the previous "if True: #try:" scaffolding disabled the except
    # clauses, so malformed LLM output crashed on json.loads().
    try:
        extracted_data = json.loads(extracted_data_json)
        st.write(extracted_data)
        return {key: extracted_data.get(key, default) for key, default in defaults.items()}
    except json.JSONDecodeError:
        st.error("Failed to parse JSON response from LLM.")
        return {}
import re
from typing import Dict, Any
def generate_data_from_docx(file):
    """Generate data dictionary from uploaded DOCX file.

    Unlike generate_data_from_docx_1, this parses the LLM response with a
    per-field regex instead of json.loads, so it tolerates non-JSON output.

    Args:
        file: Path or file-like object accepted by docx.Document.

    Returns:
        dict: Field key -> extracted value, or {} on unexpected failure.
    """
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    content = summarize_content(content, token_limit=650)
    prompt = create_extraction_prompt(content)
    content = call_llm(f"{prompt}")
    st.subheader("data extraction")
    st.write(content)

    def extract_field(field: str) -> str:
        # Accept bare or quoted keys ("field": / 'field':) — the extraction
        # prompt asks for JSON, so keys usually come back quoted; the old
        # pattern only matched bare names and missed every quoted key.
        pattern = rf"""["']?{field}["']?\s*:\s*(.+)"""
        match = re.search(pattern, content, re.IGNORECASE)
        if not match:
            return f"No {field} information found"
        # Trim the trailing comma and surrounding quotes JSON syntax leaves.
        return match.group(1).strip().rstrip(",").strip("\"' ")

    fields = [
        "company_name", "industry", "location", "mission", "vision",
        "products_services", "target_market", "value_proposition",
        "current_revenue", "current_expenses", "funding_requirements",
        "management_team", "company_structure", "goals_objectives",
        "operational_strategy", "market_overview", "promotional_strategy",
        "current_status", "existing_company_details",
        "accelerator_participation", "finland_contacts", "reason_for_finland",
        "first_year_plan", "funding_plan", "additional_information",
    ]
    try:
        return {field: extract_field(field) for field in fields}
    except Exception as e:
        print(f"Error extracting fields: {str(e)}")
        return {}
def generate_data_from_docx_2_1(file):
    """Generate data dictionary from uploaded DOCX file.

    Summarizes the document, asks the LLM for a JSON extraction, and returns
    a dict of the 25 business-plan fields with defaults for anything missing.

    Args:
        file: Path or file-like object accepted by docx.Document.

    Returns:
        dict: Field key -> extracted value; {} (plus a Streamlit error) when
        the response is not JSON, not a dict, or extraction fails.
    """
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    # Summarize content with a token limit
    content = summarize_content(content, token_limit=650)
    # Create extraction prompt
    prompt = create_extraction_prompt(content)
    # Call LLM to extract data
    extracted_data_json = call_llm(prompt)
    st.subheader("Data Extraction")
    # Default values used when the LLM omits a field (idiom fix: replaces 50
    # repetitive .get(...) assignments with one table + comprehension).
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }
    try:
        # Assuming extracted_data_json is a JSON string, parse it into a dict
        extracted_data = json.loads(extracted_data_json)
        if isinstance(extracted_data, dict):
            return {key: extracted_data.get(key, default) for key, default in defaults.items()}
        st.error("Expected a dictionary for the extracted data.")
        return {}
    except json.JSONDecodeError:
        st.error("Failed to parse JSON response from LLM.")
        return {}
    except Exception as e:
        st.error(f"An error occurred while extracting fields: {str(e)}")
        return {}
# Example usage:
# file_path = "path/to/your/docx/file.docx"
# data = generate_data_from_docx_0(file_path)
# print(data)

# Demo, guarded by __main__ so importing this module no longer runs the
# summarizer as an import-time side effect.
if __name__ == "__main__":
    content = (
        "Artificial intelligence (AI) is intelligence demonstrated by machines, "
        "in contrast to the natural intelligence displayed by humans and animals. "
        "Leading AI textbooks define the field as the study of 'intelligent agents': "
        "any device that perceives its environment and takes actions that maximize "
        "its chance of successfully achieving its goals."
    )
    # Label fix: the old output claimed "TextRank"/"LexRank" with a 2049-token
    # limit, but summarize_content tries every technique internally and the
    # actual limit is token_limit (6000); it also printed the identical call
    # twice, so one invocation suffices.
    print(f"Best summary (token limit: {token_limit}):")
    print(summarize_content(content, token_limit))