# NOTE: the three lines below were non-Python paste artifacts from a hosted
# runner ("Spaces:", "Runtime error", "Runtime error"); kept as comments so
# the file parses.
# Spaces:
# Runtime error
# Runtime error
# Standard library
import json
import os

# Third-party
import httpx
import nltk
import tiktoken
from dotenv import load_dotenv
from exa_py import Exa
from groq import Groq
from nltk.corpus import stopwords
from retrying import retry
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

# NLTK corpora needed by the sumy Tokenizer ('punkt') and stopword filtering
# ('stopwords'). quiet=True suppresses the download log that otherwise prints
# on every import of this module.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Load environment variables from a local .env file (provides EXA_API_KEY and
# GROQ_API_KEY read below) -- must run before the os.getenv() calls.
load_dotenv()
# Initialize APIs
exa = Exa(api_key=os.getenv("EXA_API_KEY"))
#client = Groq(api_key=os.getenv("GROQ_API_KEY"))
utilized_model = "llama3-70b-8192"  # Groq chat model used by call_llm()
token_limit = 6000  # default token budget consumed by summarize_content()
model_name = "gpt-3.5-turbo"  # NOTE(review): a model name, not an encoding name -- see count_tokens()
try:
    # Use a custom HTTP client
    http_client = httpx.Client()
    client = Groq(api_key=os.getenv("GROQ_API_KEY"), http_client=http_client)
    print("Groq client initialized successfully!")
except TypeError as e:
    print("Error initializing Groq client:", str(e))
except Exception as ex:
    # NOTE(review): on failure `client` stays undefined and call_llm() will
    # raise NameError later -- intentional best-effort startup, only logged.
    print("Unexpected error:", str(ex))
# Options forwarded to exa.search_and_contents() in call_llm(): request one
# 7-sentence highlight per result URL.
highlights_options = {
    "num_sentences": 7,
    "highlights_per_url": 1,
}
def call_llm(prompt):
    """Send the prompt to the LLM and return the response."""
    # Gather supporting snippets from Exa web search to ground the answer.
    search_response = exa.search_and_contents(
        query=prompt,
        highlights=highlights_options,
        num_results=3,
        use_autoprompt=True,
    )
    info = []
    for result in search_response.results:
        info.append(result.highlights[0])
    system_prompt = "You are a business proposal assistant. Generate detailed and precise responses to the user's query."
    user_prompt = f"Sources: {info}\nQuestion: {prompt}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(model=utilized_model, messages=messages)
    return completion.choices[0].message.content
def summarize_content(content, token_limit):
    """
    Summarize *content* with several extractive summarizers and return the
    best candidate that fits within the token budget.

    The text is split into fixed-size character chunks; each chunk is run
    through TextRank, LexRank, LSA and Luhn (2 sentences each), and the
    longest candidate that still fits inside ``token_limit`` wins.

    Args:
        content (str): The text to be summarized.
        token_limit (int): Maximum number of whitespace-delimited words
            allowed in the returned summary.

    Returns:
        str: A summary of the content ("" for empty input or when every
        candidate exceeds the limit).
    """
    if not content:
        return ""
    summarizers = {
        'text_rank': TextRankSummarizer(),
        'lex_rank': LexRankSummarizer(),
        'lsa': LsaSummarizer(),
        'luhn': LuhnSummarizer(),
    }
    best_summary = ""
    best_score = 0.0
    # Split content into chunks
    chunk_size = 1000  # characters per chunk; adjust based on your needs
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    for chunk in chunks:
        parser_chunk = PlaintextParser.from_string(chunk, Tokenizer("english"))
        for summarizer in summarizers.values():
            # sumy returns a tuple of Sentence objects; join their text
            # explicitly instead of str()-ing the tuple, which produced the
            # repr noise the old code tried to strip with .replace() hacks.
            sentences = summarizer(parser_chunk.document, 2)
            candidate = " ".join(str(sentence) for sentence in sentences)
            word_count = len(candidate.split())
            score = word_count / token_limit
            # Prefer the longest candidate, but only those within budget --
            # the old code maximized score with no cap, so token_limit was
            # never actually enforced.
            if word_count <= token_limit and score > best_score:
                best_summary = candidate
                best_score = score
    return best_summary
| import re | |
| from docx import Document | |
| import streamlit as st | |
def count_tokens(text, model_name):
    """Return the number of tiktoken tokens in *text* for *model_name*.

    Args:
        text (str): Text to tokenize.
        model_name (str): An OpenAI model name (e.g. "gpt-3.5-turbo") or a
            raw encoding name (e.g. "cl100k_base").

    Returns:
        int: Token count.
    """
    try:
        # tiktoken.get_encoding() accepts only encoding names and raises on
        # model names such as the module-level "gpt-3.5-turbo", so model
        # names must be resolved via encoding_for_model() first.
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Fall back for callers that pass a raw encoding name.
        encoding = tiktoken.get_encoding(model_name)
    return len(encoding.encode(text))
def create_extraction_prompt(content):
    """Create a prompt for extracting specific variables from the summarized content.

    Args:
        content (str): Summarized document text to embed in the prompt.

    Returns:
        str: A prompt instructing the LLM to answer with a JSON object whose
        keys match the fields consumed by the generate_data_from_docx_*
        helpers.
    """
    # (numbered-list label, JSON key) pairs -- keeps the human-readable field
    # list and the JSON template in sync instead of duplicating 25 lines of
    # each by hand.
    fields = [
        ("Company Name", "company_name"),
        ("Industry", "industry"),
        ("Location", "location"),
        ("Mission", "mission"),
        ("Vision", "vision"),
        ("Products/Services", "products_services"),
        ("Target Market", "target_market"),
        ("Value Proposition", "value_proposition"),
        ("Current Revenue", "current_revenue"),
        ("Current Expenses", "current_expenses"),
        ("Funding Requirements", "funding_requirements"),
        ("Management Team", "management_team"),
        ("Company Structure", "company_structure"),
        ("Goals/Objectives", "goals_objectives"),
        ("Operational Strategy", "operational_strategy"),
        ("Market Overview", "market_overview"),
        ("Promotional Strategy", "promotional_strategy"),
        ("Current Status", "current_status"),
        ("Existing Company Details", "existing_company_details"),
        ("Accelerator Participation", "accelerator_participation"),
        ("Finland Contacts", "finland_contacts"),
        ("Reason for Finland Participation", "reason_for_finland"),
        ("First Year Plan", "first_year_plan"),
        ("Funding Plan", "funding_plan"),
        ("Additional Information", "additional_information"),
    ]
    numbered = "\n".join(f"{i}. {label}" for i, (label, _) in enumerate(fields, start=1))
    # Double-quoted keys: the original template showed single-quoted keys,
    # which nudges the model toward output that json.loads() in the callers
    # cannot parse.
    json_lines = ",\n".join(f'  "{key}": <value>' for _, key in fields)
    prompt = (
        "You are an expert business consultant. Please extract the following information from the provided content:\n\n"
        f"{numbered}\n\n"
        f"Here is the summarized content:\n{content}\n\n"
        "Please provide the extracted information in JSON format, structured as follows:\n"
        "{\n"
        f"{json_lines}\n"
        "}\n"
    )
    return prompt
def generate_data_from_docx_1(file):
    """Generate data dictionary from uploaded DOCX file.

    Reads the document text, summarizes it to fit the prompt budget, asks
    the LLM for a JSON object of business-plan fields, and returns that
    object with defaults filled in for any missing field.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: Field name -> extracted value (or its default), or {} when
        the LLM response is not valid JSON.
    """
    # Defaults double as the authoritative list of expected fields.
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    # Summarize, reserving ~1050 tokens of headroom for prompt boilerplate.
    content = summarize_content(content, token_limit - 1050)
    prompt = create_extraction_prompt(content)
    extracted_data_json = call_llm(prompt)
    st.subheader("Data Extraction")
    # The original code had this try disabled via `if True: #try:`, so a
    # malformed LLM reply crashed the app; re-enable the error handling.
    try:
        extracted_data = json.loads(extracted_data_json)
    except json.JSONDecodeError:
        st.error("Failed to parse JSON response from LLM.")
        return {}
    st.write(extracted_data)
    return {key: extracted_data.get(key, default) for key, default in defaults.items()}
| import re | |
| from typing import Dict, Any | |
def generate_data_from_docx(file):
    """Generate data dictionary from uploaded DOCX file.

    Unlike generate_data_from_docx_1, this variant scrapes fields out of
    the LLM reply with per-field regexes instead of json.loads, so it
    tolerates replies that are not strictly valid JSON.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: Field name -> extracted value (or a "No <field> information
        found" placeholder), or {} on unexpected failure.
    """
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    content = summarize_content(content, token_limit=650)
    prompt = create_extraction_prompt(content)
    content = call_llm(f"{prompt}")
    st.subheader("data extraction")
    st.write(content)

    def extract_field(field: str) -> str:
        # Accept bare, single-quoted, and double-quoted keys ("field": ...),
        # since the prompt asks for JSON-style output; the old pattern only
        # matched the bare `field:` form and missed quoted keys.
        pattern = f"['\"]?{re.escape(field)}['\"]?\\s*:\\s*(.+)"
        match = re.search(pattern, content, re.IGNORECASE)
        return match.group(1).strip() if match else f"No {field} information found"

    field_names = [
        "company_name", "industry", "location", "mission", "vision",
        "products_services", "target_market", "value_proposition",
        "current_revenue", "current_expenses", "funding_requirements",
        "management_team", "company_structure", "goals_objectives",
        "operational_strategy", "market_overview", "promotional_strategy",
        "current_status", "existing_company_details",
        "accelerator_participation", "finland_contacts",
        "reason_for_finland", "first_year_plan", "funding_plan",
        "additional_information",
    ]
    try:
        return {field: extract_field(field) for field in field_names}
    except Exception as e:
        print(f"Error extracting fields: {str(e)}")
        return {}
def generate_data_from_docx_2_1(file):
    """Generate data dictionary from uploaded DOCX file.

    Summarizes the document to a 650-token budget, asks the LLM for a JSON
    object of business-plan fields, validates the reply, and returns the
    fields with defaults filled in for anything missing.

    Args:
        file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        dict: Field name -> extracted value (or its default); {} when the
        reply is not valid JSON, not a dict, or extraction fails.
    """
    # Defaults double as the authoritative list of expected fields.
    defaults = {
        "company_name": "Unknown Company",
        "industry": "Unknown Industry",
        "location": "Unknown Location",
        "mission": "No mission statement found",
        "vision": "No vision statement found",
        "products_services": "No products/services description found",
        "target_market": "No target market description found",
        "value_proposition": "No competitive advantage description found",
        "current_revenue": "$0.00",
        "current_expenses": "$0.00",
        "funding_requirements": "No funding requirements found",
        "management_team": "No management team information found",
        "company_structure": "No company structure information found",
        "goals_objectives": "No goals and objectives found",
        "operational_strategy": "No operational strategy found",
        "market_overview": "No market overview found",
        "promotional_strategy": "No promotional strategy found",
        "current_status": "Unknown Current Status",
        "existing_company_details": "No existing company details found",
        "accelerator_participation": "Unknown Accelerator Participation",
        "finland_contacts": "No Finland contacts information found",
        "reason_for_finland": "Unknown Reason for Finland Participation",
        "first_year_plan": "No first year plan information found",
        "funding_plan": "No funding plan information found",
        "additional_information": "No additional information found",
    }
    doc = Document(file)
    content = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    # Summarize content with a token limit
    content = summarize_content(content, token_limit=650)
    prompt = create_extraction_prompt(content)
    extracted_data_json = call_llm(prompt)
    st.subheader("Data Extraction")
    try:
        extracted_data = json.loads(extracted_data_json)
        if not isinstance(extracted_data, dict):
            st.error("Expected a dictionary for the extracted data.")
            return {}
        return {key: extracted_data.get(key, default) for key, default in defaults.items()}
    except json.JSONDecodeError:
        st.error("Failed to parse JSON response from LLM.")
        return {}
    except Exception as e:
        st.error(f"An error occurred while extracting fields: {str(e)}")
        return {}
# Example usage:
# file_path = "path/to/your/docx/file.docx"
# data = generate_data_from_docx_1(file_path)
# print(data)

if __name__ == "__main__":
    # Demo: summarize a short sample. Guarded so that importing this module
    # (e.g. from the Streamlit app) does not print demo output at import
    # time. The old labels were wrong twice over: they claimed a 2049-token
    # limit while token_limit is 6000, and claimed TextRank vs LexRank while
    # making two identical calls.
    content = "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of 'intelligent agents': any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals."
    print(f"Summary (token limit: {token_limit}):")
    print(summarize_content(content, token_limit))