Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| import shutil | |
| import gradio as gr | |
| import utils_data_extraction | |
| import utils_assessment | |
| import importlib | |
| importlib.reload(utils_data_extraction) | |
| importlib.reload(utils_assessment) | |
| """### Function to load data | |
| Data is loaded from a Roamler Excel file, from a sheet called "Output". | |
| - A subset of the Excel columns is taken as reference data, and saved as `outputs/<Excel file name>/data_extraction/reference_data.csv` | |
| - A folder for storing photos is created at `outputs/<Excel file name>/photos` | |
| An `n_rows` parameter can be passed to load a random sample of the rows instead of the whole sheet (pass `None` to load every row). | |
| """ | |
def load_roamler_excel_file(filepath, n_rows=3):
    """Load a Roamler Excel export and prepare its working directory.

    The 'Output' sheet of the workbook is read, optionally down-sampled, and
    the reference columns are written to
    ``outputs/<file name>/data_extraction/reference_data.csv``. A
    ``outputs/<file name>/photos`` folder is created as well.

    Args:
        filepath: Path of the uploaded Excel file. A falsy value (``None`` or
            an empty string, e.g. when the upload widget has been cleared)
            resets every output to its empty state instead of raising.
        n_rows: Number of rows to sample (seeded, so the sample is
            reproducible). Capped at the number of rows available. ``None``
            keeps every row.

    Returns:
        A 6-tuple ``(df_products, OUTPUT_DIR, df_brand_data,
        df_product_name_data, df_ingredients_data,
        df_nutritional_values_data)``.
    """
    if not filepath:
        # Nothing to load: hand back the same empty placeholders the UI
        # starts with rather than failing inside os.path.basename(None).
        return (pd.DataFrame(), "", pd.DataFrame(), pd.DataFrame(),
                pd.DataFrame(), pd.DataFrame())

    OUTPUT_DIR = 'outputs/' + os.path.basename(filepath)
    DATA_EXTRACTION_DIR = OUTPUT_DIR + '/data_extraction'
    PHOTO_DIR = OUTPUT_DIR + '/photos'
    for directory in (OUTPUT_DIR, DATA_EXTRACTION_DIR, PHOTO_DIR):
        os.makedirs(directory, exist_ok=True)

    df_review = pd.read_excel(filepath, sheet_name='Output')
    if n_rows is not None:
        # DataFrame.sample raises when asked for more rows than exist, so
        # cap the request at the size of the sheet.
        sample_size = min(n_rows, len(df_review))
        df_review = df_review.sample(n=sample_size, random_state=42)

    reference_columns = [
        'ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo',
        'Brand', 'Product name', 'Legal name', 'Barcode',
        'Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates',
        'Sugars', 'Fibers', 'Proteins', 'Salt', 'Ingredients',
        'Nutriscore', 'Allergens',
        'Quantity per unit',
    ]
    df_products = df_review[reference_columns].copy()
    df_products.to_csv(f'{OUTPUT_DIR}/data_extraction/reference_data.csv', index=False)

    # Pick up any extraction results already stored for this file.
    df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_df_from_folder(OUTPUT_DIR)
    return df_products, OUTPUT_DIR, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
def load_df_from_folder(OUTPUT_DIR):
    """Read the four extraction result tables stored under OUTPUT_DIR.

    For each of brand, product_name, ingredients and nutritional_values, the
    file ``<OUTPUT_DIR>/data_extraction/<name>.csv`` is loaded when it
    exists; otherwise an empty DataFrame with the expected columns is used.

    Args:
        OUTPUT_DIR: Working directory of the current dataset.

    Returns:
        A 4-tuple ``(df_brand_data, df_product_name_data,
        df_ingredients_data, df_nutritional_values_data)``.
    """
    expected_columns = ['ID', 'Extracted_Text', 'Price', 'Processing time']
    table_names = ('brand', 'product_name', 'ingredients', 'nutritional_values')

    tables = []
    for table_name in table_names:
        csv_path = f'{OUTPUT_DIR}/data_extraction/{table_name}.csv'
        if os.path.exists(csv_path):
            tables.append(pd.read_csv(csv_path))
        else:
            tables.append(pd.DataFrame(columns=expected_columns))
    return tuple(tables)
def load_csv_files(archive, OUTPUT_DIR):
    """Import previously extracted CSV files into the working directory.

    Files whose base name is one of brand.csv, product_name.csv,
    ingredients.csv or nutritional_values.csv are copied into
    ``<OUTPUT_DIR>/data_extraction``; any other file is ignored. The four
    result tables are then reloaded from that folder.

    Args:
        archive: Iterable of uploaded file paths, or ``None`` when no file is
            selected (e.g. after the upload widget has been cleared).
        OUTPUT_DIR: Working directory of the current dataset.

    Returns:
        A 4-tuple ``(df_brand_data, df_product_name_data,
        df_ingredients_data, df_nutritional_values_data)``.
    """
    accepted_files = {'brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv'}
    destination = f'{OUTPUT_DIR}/data_extraction'

    # "archive or []" keeps a cleared upload (None) from raising TypeError.
    for file in archive or []:
        file_name = os.path.basename(file)
        print(file_name)
        if file_name in accepted_files:
            shutil.copy(file, destination)

    return load_df_from_folder(OUTPUT_DIR)
| """### Function to save data | |
| This function is called when the user clicks on the "Generate data archive" button. | |
| It creates a zip archive of the contents of the f'{OUTPUT_DIR}/data_extraction' folder, and returns a download button pointing to the archive. | |
| """ | |
def generate_archive(OUTPUT_DIR):
    """Zip the data_extraction folder and expose it through a download button.

    The archive is written next to the working directory as
    ``<OUTPUT_DIR>.zip`` and contains the contents of
    ``<OUTPUT_DIR>/data_extraction``.

    Args:
        OUTPUT_DIR: Working directory of the current dataset.

    Returns:
        A visible ``gr.DownloadButton`` whose value is the archive path.
    """
    archive_base = f'{OUTPUT_DIR}'
    source_dir = f'{OUTPUT_DIR}/data_extraction'
    shutil.make_archive(archive_base, 'zip', source_dir)

    zip_path = f'{archive_base}.zip'
    return gr.DownloadButton(
        label=f"Download {archive_base}.zip",
        value=zip_path,
        visible=True,
    )
| """### Gradio UI""" | |
def toggle_row_visibility(show):
    """Return a Gradio update that shows the target when `show` is truthy and hides it otherwise."""
    return gr.update(visible=bool(show))
# Custom CSS capping the height of elements tagged with the "dataframe-wrap"
# class; taller content scrolls vertically.
custom_css = """
.dataframe-wrap {
    max-height: 300px; /* Set the desired height */
    overflow-y: scroll;
}
"""

# Placeholders shown before any Roamler file has been uploaded. Each name gets
# its own DataFrame so that changing one can never leak into the others.
OUTPUT_DIR_value = ""
dummy_data = pd.DataFrame()
df_brand_data = pd.DataFrame()
df_product_name_data = pd.DataFrame()
df_ingredients_data = pd.DataFrame()
df_nutritional_values_data = pd.DataFrame()
# To start with a dataset preloaded during local development, replace the
# placeholders above with the six values returned by
# load_roamler_excel_file("<path to workbook>", n_rows=3).
# Build the data-extraction page: an upload area, a dataset summary and a
# per-field extraction section, followed by the event wiring.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy of the file that had lost its indentation — confirm against the
# rendered page.
with gr.Blocks(css=custom_css) as fdl_data_extraction_ui:
    gr.HTML("<div align='center'><h1>Euroconsumers Food Data Lake</h1></div>")
    gr.HTML("<div align='center'><h2>Data extraction</h2></div>")

    # Working directory of the loaded dataset; set by load_roamler_excel_file
    # and passed to every extraction/export callback.
    OUTPUT_DIR = gr.State(value=OUTPUT_DIR_value)

    with gr.Row():
        with gr.Column():
            gr.HTML("<h2>Upload Roamler Excel file</h2>")
            load_roamler_excel_file_input = gr.File(label="Upload Roamler Excel file", type="filepath")

    # Hidden until a file has been uploaded.
    with gr.Row(visible=False) as dataset_block:
        with gr.Column():
            gr.HTML("<h2>Dataset summary</h2>")
            # Reference data of the loaded dataset, in a height-limited row.
            with gr.Row(elem_classes="dataframe-wrap"):
                dataframe_component = gr.DataFrame(value=dummy_data, interactive=False)

    # Hidden until a file has been uploaded.
    with gr.Row(visible=False) as product_detail_block:
        with gr.Column():
            # Section for data extraction
            gr.HTML("<h1>Data extraction</h1>")
            load_csv_files_input = gr.Files(label="Upload extracted data from CSV files")
            language = gr.Dropdown(label="Select language",
                                   choices=["French", "Dutch", "Spanish", "Italian", "Portuguese"],
                                   value="French")

            gr.HTML("<h3>Brand</h3>")
            extract_brand_button = gr.Button("Extract brand")
            df_brand = gr.Dataframe(label="Brand data", scale=2,
                                    column_widths=["10%", "60%", "15%", "15%"],
                                    wrap=True, value=df_brand_data)

            gr.HTML("<h3>Product name</h3>")
            extract_product_name_button = gr.Button("Extract product_name")
            df_product_name = gr.Dataframe(label="Product name data", scale=2,
                                           column_widths=["10%", "60%", "15%", "15%"],
                                           wrap=True, value=df_product_name_data)

            gr.HTML("<h3>Ingredients</h3>")
            extract_ingredients_button = gr.Button("Extract ingredients")
            df_ingredients = gr.Dataframe(label="Ingredients data", scale=2,
                                          column_widths=["10%", "60%", "15%", "15%"],
                                          wrap=True, value=df_ingredients_data)

            gr.HTML("<h3>Nutritional values</h3>")
            extract_nutritional_values_button = gr.Button("Extract nutritional values")
            df_nutritional_values = gr.Dataframe(label="Nutritional data", scale=2,
                                                 column_widths=["10%", "60%", "15%", "15%"],
                                                 wrap=True, value=df_nutritional_values_data)

            # Download
            gr.HTML("<h1>Data download</h1>")
            generate_merged_file_button = gr.Button("Generate merged file")
            generate_archive_button = gr.Button("Generate data archive")
            download_button = gr.DownloadButton("Download archive", visible=False)

    ### Control functions
    # When the uploaded file changes, load it and refresh the summary table,
    # the working directory and the four extraction tables.
    load_roamler_excel_file_input.change(load_roamler_excel_file,
                                         inputs=load_roamler_excel_file_input,
                                         outputs=[dataframe_component, OUTPUT_DIR,
                                                  df_brand, df_product_name, df_ingredients, df_nutritional_values])
    # Show the dataset and extraction blocks only while a file is present.
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=dataset_block)
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=product_detail_block)

    # Import previously extracted CSV files and refresh the four tables.
    load_csv_files_input.change(load_csv_files,
                                inputs=[load_csv_files_input, OUTPUT_DIR],
                                outputs=[df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Data extraction
    extract_brand_button.click(utils_data_extraction.extract_brand,
                               inputs=[OUTPUT_DIR, dataframe_component, language],
                               outputs=df_brand)
    extract_product_name_button.click(utils_data_extraction.extract_product_name,
                                      inputs=[OUTPUT_DIR, dataframe_component, language],
                                      outputs=df_product_name)
    extract_ingredients_button.click(utils_data_extraction.extract_ingredients,
                                     inputs=[OUTPUT_DIR, dataframe_component, language],
                                     outputs=df_ingredients)
    extract_nutritional_values_button.click(utils_data_extraction.extract_nutritional_values,
                                            inputs=[OUTPUT_DIR, dataframe_component, language],
                                            outputs=df_nutritional_values)

    # Export
    generate_merged_file_button.click(utils_assessment.merge_and_save_data, inputs=OUTPUT_DIR)
    generate_archive_button.click(generate_archive, inputs=OUTPUT_DIR, outputs=download_button)

fdl_data_extraction_ui.launch(debug=True)