Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import google.generativeai as genai | |
| from langchain.llms import OpenAI | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.agents.agent_types import AgentType | |
| #from langchain_experimental.agents.agent_toolkits import create_csv_agent | |
| from llama_index.llms import OpenAI | |
| from llama_index import VectorStoreIndex, SimpleDirectoryReader | |
| from llama_index.llms import OpenAI | |
| from llama_index import StorageContext, load_index_from_storage | |
| #os.environ["OPENAI_API_KEY"] | |
# Base URL of the Storemate help-centre docs index page; get_base_links()
# reads this module-level constant to discover individual article links.
url: str = 'https://help.storemate.cloud/docs/'
def get_web_data(valid_links):
    """Scrape each docs page in *valid_links* and save its text under user_guide/.

    For every URL, the page's first <h1> text is used as the title (and the
    output filename) and the first <div> following it as the article body.
    A "more detail link" line pointing back at the source URL is appended.

    Args:
        valid_links: iterable of absolute URLs of docs pages to scrape.

    Raises:
        requests.HTTPError: if a page responds with an error status.
    """
    for url in valid_links:
        # Fail fast on network/HTTP errors instead of silently parsing an
        # error page; timeout prevents a hung request from stalling the run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Title is the first <h1>; the article body is the <div> that
        # immediately follows it. Find the <h1> once and reuse it.
        heading = soup.find('h1')
        title = heading.get_text()
        section = heading.find_next('div')
        section_text = section.get_text().strip()
        section_text = section_text + f"\nmore detail link : {url}"
        # Context manager guarantees the file is closed even on error;
        # explicit UTF-8 avoids platform-dependent default encodings.
        with open(f"user_guide/{title}.txt", "w", encoding="utf-8") as file:
            file.write(f"{title}\n{section_text}")
    print("data collected")
def get_base_links():
    """Collect article links from the docs index page and scrape each one.

    Reads the module-level ``url`` constant, fetches the index page, keeps
    every <a href> that points under the docs base URL, and hands the
    resulting list to ``get_web_data``.

    Raises:
        requests.HTTPError: if the index page responds with an error status.
    """
    # Fetch the index page; fail fast on errors, bound the wait time.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)
    # Keep only anchors whose href actually points under the docs base URL.
    # (The original tested `url in str(link)`, which could also match the
    # base URL appearing in link text or attributes other than href.)
    valid_links = [link['href'] for link in links
                   if link['href'].startswith(url)]
    print("base links collected")
    get_web_data(valid_links)
def update_user_guide():
    """Rebuild the user-guide vector index from freshly scraped docs.

    Scrapes the help site into the user_guide/ directory, builds a
    VectorStoreIndex over those files, and persists it to "llama_index"
    so later runs can reload it via load_index_from_storage.

    Returns:
        str: "done" once the index has been persisted.
    """
    # Refresh the raw text files first so the index reflects current docs.
    get_base_links()
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    # Persist to disk; a cached-load path (StorageContext.from_defaults +
    # load_index_from_storage) could be added here to skip rebuilding.
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"