Spaces:
Sleeping
Sleeping
| import tiktoken | |
| import os | |
| from bs4 import BeautifulSoup | |
| import gradio as gr | |
| from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate | |
| from langchain.memory import ConversationBufferWindowMemory | |
| import openai | |
| import requests | |
| from langchain.chat_models import ChatOpenAI | |
| import ast | |
| import re | |
| import json | |
| import tempfile | |
| import collections | |
| import graphviz | |
# OpenAI credential, read once when the module is imported. A missing
# variable raises KeyError right here rather than on the first model call.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
def save_webpage_as_html(url, timeout=30):
    """Download *url* and collect the text of its ``<li>`` and ``<ol>`` elements.

    Despite the name, nothing is written to disk: the page is fetched,
    parsed with BeautifulSoup and reduced to a dictionary of list texts.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the caller
            indefinitely.

    Returns:
        A ``collections.defaultdict(list)`` mapping the tag name (``'li'``
        or ``'ol'``) to the stripped text of every matching element, or
        ``None`` when the server answers with a status other than 200.

    Raises:
        requests.RequestException: On connection problems, invalid URLs or
            when *timeout* expires.
    """
    # Fixed browser-like headers sent with every request.
    # NOTE(review): they look like they were captured from a coches.net
    # browser session; confirm they are wanted for arbitrary URLs.
    headers = {
        'authority': 'ms-mt--api-web.spain.advgo.net',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'x-adevinta-channel': 'web-desktop',
        'x-schibsted-tenant': 'coches',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.coches.net',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://www.coches.net/',
        'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    }
    response = requests.get(url, headers=headers, timeout=timeout)

    # Anything other than a plain 200 is reported and signalled with None.
    if response.status_code != 200:
        print(f"Failed to get the webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Group the stripped text of each element under its tag name.
    result = collections.defaultdict(list)
    for tag in soup.find_all(['li', 'ol']):
        result[tag.name].append(tag.get_text(strip=True))
    return result
# Skeleton of the nested taxonomy that is pasted into the prompts as the
# required output shape. It is illustrative text, not valid JSON: the
# innermost lists hold bare placeholders (category1, category2, ...).
output_json_format = '''
{
"category": "root_category",
"subcategories": [
{
"category": "node_category",
"subcategories": [
{
"category": "node_category",
"subcategories": [category1, category2, ...]
},
{
"category": "node_category",
"subcategories": [category1, category2, ...]
}
]
},
{
"category": "node_category",
"subcategories": [category1, category2, ...]
}
]
}
'''

# Starting taxonomy used when no JSON file is supplied: a root category
# without any children.
empty_json = dict(category="root_category", subcategories=[])
def get_taxanomy_from_url(url):
    """Ask GPT-4 to derive a taxonomy from the list items of a web page.

    The page is reduced to its ``<li>``/``<ol>`` texts by
    ``save_webpage_as_html``, embedded in a prompt together with
    ``output_json_format`` and sent to the model. The reply is parsed as a
    Python dictionary literal and written to ``url_temp.json``.

    Args:
        url: Address of the e-commerce page to analyse.

    Returns:
        The name of the JSON file that was written (``"url_temp.json"``).

    Raises:
        ValueError: If the page could not be downloaded, or if the model
            did not return a dictionary literal after a few attempts.
    """
    max_prompt_tokens = 8000
    max_attempts = 3

    url_dict = save_webpage_as_html(url)
    if url_dict is None:
        # Do not spend a model call on the literal text "None".
        raise ValueError(f"Could not download a page from: {url}")
    json_input = str(url_dict)

    template = '''
{history}
{human_input}
'''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )
    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )

    def build_prompt(payload):
        """Return the full instruction text with *payload* as the page data."""
        return f'''
You are an expert ecommerce product taxanomy analyst.
You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
it describes content and relationships.
A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
it prescribes structure and terminology."
You have a task to extract taxanomy from a python dictionary of an extracted html page of an ecommerce website.
Here is the input python dictionary:
{payload}
Here is the output json format:
{output_json_format}
From the input python dictionary, extract all available products under the li and ol key and create the output json taxanomy.
Think step by step.
Place the products in categories and subcategories accordingly.
Organize all the products to fit the output json format.
The output should follow a python dictionary..
Do not declare a new variable, output the python dictionary json object only.
Do not output "The taxonomy extracted from the given python list can be represented as follows:"
Do not provide extra information. Directly output the python dictionary only.
Do not insert any string before or after the python dictionary.
Do not say "Here is"
Do not say "As an AI model"
Do not explain anything about the python dictionary.
Output python dictionary only. ONLY python dictionary as output.
'''

    encoding = tiktoken.encoding_for_model("gpt-4")
    prompt_input2 = build_prompt(json_input)

    # The page data sits in the middle of the prompt, so cutting the whole
    # prompt from the end would drop the output format and the instructions.
    # When the prompt is too long, shorten the page data instead.
    overflow = len(encoding.encode(prompt_input2)) - max_prompt_tokens
    if overflow > 0:
        page_tokens = encoding.encode(json_input)
        keep = max(len(page_tokens) - overflow, 0)
        prompt_input2 = build_prompt(encoding.decode(page_tokens[:keep]))

    # Final hard cap: re-tokenising the spliced text can shift the count by
    # a few tokens.
    encoded_prompt2 = encoding.encode(prompt_input2)[:max_prompt_tokens]
    prompt_input2 = encoding.decode(encoded_prompt2)

    # Bounded retries: every attempt is a paid model call, and an unparsable
    # reply must not crash the loop or keep it running forever.
    json_dict = None
    for attempt in range(1, max_attempts + 1):
        json_taxanomy_output = chatgpt_chain.predict(human_input=prompt_input2)
        try:
            parsed = ast.literal_eval(json_taxanomy_output)
        except (ValueError, TypeError, SyntaxError) as err:
            print(f"Attempt {attempt}: reply is not a Python literal ({err})")
            continue
        if isinstance(parsed, dict):
            json_dict = parsed
            break
        print(f"Attempt {attempt}: reply is a {type(parsed).__name__}, not a dictionary")

    if json_dict is None:
        raise ValueError(
            f"The model did not return a dictionary after {max_attempts} attempts"
        )

    file_name = "url_temp.json"
    with open(file_name, 'w') as json_file:
        # indent keeps the saved file readable
        json.dump(json_dict, json_file, indent=4)
    return file_name
def expand_taxanomy(json_dict, num_layers, num_items, category_type):
    """Ask GPT-4 to grow an existing taxonomy and return the expanded tree.

    Args:
        json_dict: Current taxonomy; its ``str()`` form is pasted into the
            prompt.
        num_layers: Requested depth of the tree. Converted with ``int`` so
            values such as ``3.0`` appear as ``3`` in the prompt.
        num_items: Requested total number of items, converted like
            *num_layers*.
        category_type: Free-text description of the category domain.

    Returns:
        The dictionary parsed from the model reply.

    Raises:
        ValueError: If the reply is neither a Python literal nor JSON, or
            if it parses to something other than a dictionary.
    """
    num_layers = str(int(num_layers))
    num_items = str(int(num_items))
    json_input = str(json_dict)

    template = '''
{history}
{human_input}
'''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )
    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )
    prompt_input1 = f'''
You are an expert ecommerce product taxanomy analyst.
You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
it describes content and relationships.
A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
it prescribes structure and terminology."
You have a task to expand a taxanomy that is formatted in a json file.
The taxanomy tree should be {num_layers} layer deep with a total of {num_items} items.
The category type is {category_type}.
Here is the input json file:
{json_input}
Here is the output json format:
{output_json_format}
Expand the taxanomy of the input json file.
Find subcategories that fits each category.
Expand the leafs of the taxanomy tree.
Go deeper. Think step by step.
Find all subcategories and output it as a json object.
The output should follow a python dictionary.
Do not say "Here is"
Do not declare a new variable, output the python dictionary json object only.
Do not provide extra information. Directly output the python dictionary only.
'''
    # Cap the prompt at 8000 tokens.
    # NOTE(review): the cut is taken from the end, so an oversized input
    # taxonomy pushes the trailing instructions out of the prompt.
    encoding = tiktoken.encoding_for_model("gpt-4")
    encoded_prompt1 = encoding.encode(prompt_input1)[:8000]
    prompt_input1 = encoding.decode(encoded_prompt1)

    json_taxanomy_output = chatgpt_chain.predict(human_input=prompt_input1)

    try:
        expanded = ast.literal_eval(json_taxanomy_output)
    except (ValueError, TypeError, SyntaxError):
        # The model is asked for "json"; replies containing true/false/null
        # are valid JSON but not Python literals, so try the JSON parser too.
        try:
            expanded = json.loads(json_taxanomy_output)
        except ValueError as err:
            raise ValueError(
                "The model reply could not be parsed as a dictionary"
            ) from err

    # Callers dump and draw the result as a tree, so reject anything else
    # here instead of failing later with an unrelated error.
    if not isinstance(expanded, dict):
        raise ValueError(
            f"The model reply is a {type(expanded).__name__}, not a dictionary"
        )
    return expanded
def add_nodes_edges(graph, data, parent=None):
    """Add the taxonomy rooted at *data* to *graph*, depth first.

    Args:
        graph: Object exposing ``node(name)`` and ``edge(tail, head)``.
        data: Dictionary with a ``'category'`` name and an optional
            ``'subcategories'`` list whose entries are either plain strings
            (leaves) or dictionaries of the same shape (subtrees).
        parent: Name of the node *data* hangs under; a falsy value means
            no incoming edge is drawn.
    """
    label = data['category']
    graph.node(label)
    if parent:
        graph.edge(parent, label)

    for child in data.get('subcategories', []):
        if not isinstance(child, str):
            # Nested dictionary: recurse with the current node as parent.
            add_nodes_edges(graph, child, label)
            continue
        # Plain string: a leaf attached directly to the current node.
        graph.node(child)
        graph.edge(label, child)
def visualize_json(data):
    """Return a Graphviz digraph of the taxonomy in *data*.

    The graph is laid out left to right and filled by ``add_nodes_edges``;
    it is only built here, not rendered or displayed.
    """
    layout = {'rankdir': 'LR'}  # left-to-right instead of top-down
    tree = graphviz.Digraph(graph_attr=layout)
    add_nodes_edges(tree, data)
    return tree
def get_file(json_file):
    """Render a taxonomy JSON file as ``graph.png`` and return the image path.

    Args:
        json_file: Uploaded file object whose ``name`` attribute is the path
            of a JSON taxonomy. If it is missing, unreadable or not valid
            JSON, ``temp.json`` (the file ``modify_json`` writes) is used
            instead.

    Returns:
        The path returned by ``graph.render`` for the PNG image.

    Raises:
        OSError, ValueError: If the fallback ``temp.json`` cannot be read
            or parsed either.
    """
    try:
        print("loading json file")
        print("temp_file", json_file.name)
        with open(json_file.name, 'r') as handle:
            data = json.load(handle)
    except (AttributeError, TypeError, OSError, ValueError):
        # AttributeError/TypeError: nothing usable was uploaded;
        # OSError: the path cannot be opened; ValueError: not valid JSON.
        print("using temp json")
        with open('temp.json', 'r') as handle:
            data = json.load(handle)

    # Best effort: drop a stale image so the old picture is not shown again.
    try:
        os.remove('graph.png')
    except OSError:
        print("no existing graph")
    else:
        print("graph removed")

    graph = visualize_json(data)
    # Render the graph as a PNG file; cleanup removes the intermediate
    # Graphviz source file.
    graph.format = 'png'
    return graph.render(filename='graph', cleanup=True)
def modify_json(json_input, num_layers, num_items, category_type):
    """Expand an uploaded (or empty) taxonomy with GPT-4 and save it.

    Args:
        json_input: Uploaded file object whose ``name`` is the path of a
            JSON taxonomy, or ``None`` to start from ``empty_json``.
        num_layers: Requested depth, passed on to ``expand_taxanomy``.
        num_items: Requested item count, passed on to ``expand_taxanomy``.
        category_type: Category description; also becomes the root category
            when starting from the empty template.

    Returns:
        The path of the written file (``'temp.json'``).
    """
    # Local import: only this function needs it.
    import copy

    print("json_input first", json_input)
    if json_input is not None:
        # Load the uploaded taxonomy.
        with open(json_input.name, 'r') as json_file:
            data = json.load(json_file)
    else:
        # Work on a copy: assigning into the shared module-level template
        # would leak one request's category into every later request.
        data = copy.deepcopy(empty_json)
        # NOTE(review): the source's indentation was lost; this assignment is
        # assumed to belong to the "no upload" branch only. Confirm.
        data["category"] = category_type

    # Store the starting point first, then overwrite it with the expansion.
    file_path = 'temp.json'
    with open(file_path, 'w') as outfile:
        json.dump(data, outfile)

    json_dict = expand_taxanomy(data, num_layers, num_items, category_type)
    print("json_dict", json_dict)

    with open(file_path, 'w') as json_file:
        # indent keeps the saved file readable
        json.dump(json_dict, json_file, indent=4)
    return file_path
def print_num(a, b):
    """Return *a* and *b* converted with ``int`` as a two-element tuple."""
    return tuple(map(int, (a, b)))
# Gradio front end. The left column expands/renders an uploaded taxonomy,
# the right column builds one from a URL and shows the results.
# NOTE(review): the source's indentation was lost; the nesting below is
# reconstructed from the order of the statements. Confirm the layout.
intro_text = """
# Auto Taxanomy App
Upload a JSON taxanomy file or generate from scratch.
"""

with gr.Blocks() as demo:
    gr.Markdown(intro_text)

    with gr.Row():
        with gr.Column():
            json_file = gr.File(label="Upload JSON here.")
            num_layers = gr.Number(label="Number of layers")
            num_items = gr.Number(label="Number of items")
            category_type = gr.Text(label="Category type")
            modify_btn = gr.Button(value="Generate")
            render_btn = gr.Button(value="Render")
        with gr.Column():
            input_url = gr.Text(label="Insert URL")
            geturl_btn = gr.Button(value="Get JSON Taxanomy")
            rendered_tree = gr.Image(label="Taxanomy Tree.")
            output_file = gr.File(label="Ouput JSON file.")

    # "Generate": expand the uploaded (or empty) taxonomy into a JSON file.
    modify_btn.click(
        fn=modify_json,
        inputs=[json_file, num_layers, num_items, category_type],
        outputs=output_file,
    )
    # "Render": draw the uploaded taxonomy as an image.
    render_btn.click(fn=get_file, inputs=json_file, outputs=rendered_tree)
    # "Get JSON Taxanomy": derive a taxonomy from the page at the given URL.
    geturl_btn.click(fn=get_taxanomy_from_url, inputs=input_url, outputs=output_file)

demo.launch()