File size: 1,349 Bytes
abb054a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import streamlit as st
from pyspark.sql import SparkSession
from huggingface_hub import HfApi, hf_hub_download
import requests

# Build the module-level Spark session used by the functions below
# (getOrCreate reuses a session if one already exists).
spark = (
    SparkSession.builder
    .appName("HuggingFaceSpark")
    .getOrCreate()
)

# Function to read Parquet files from Hugging Face
def read_parquet_from_hf(repo_id, filename, repo_type="dataset"):
    """Download one Parquet file from the Hugging Face Hub and load it with Spark.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/dataset_name"``.
        filename: Path of the Parquet file inside the repository.
        repo_type: Kind of Hub repository to look in. Defaults to
            ``"dataset"``: without an explicit type ``hf_hub_download``
            treats ``repo_id`` as a model repository, so files that live in
            dataset repositories are not found.

    Returns:
        The Spark DataFrame produced by ``spark.read.parquet`` for the
        downloaded file.
    """
    file_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
    )
    return spark.read.parquet(file_path)

# Example use case: Load a dataset from Hugging Face and display it
def load_and_display_dataset():
    """Render the Streamlit page that loads a Parquet file and previews it.

    The page asks for a repo ID and a file name. When the "Load Dataset"
    button is pressed and both fields are filled in, the file is loaded via
    ``read_parquet_from_hf`` and the page shows its schema followed by the
    first 10 rows. Any exception raised while loading or rendering is
    reported on the page with ``st.error`` instead of propagating.
    """
    st.title("Hugging Face Spark Integration")

    repo_id = st.text_input("Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):")
    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):")

    if not st.button("Load Dataset"):
        return

    # Pasted values often carry stray whitespace, which would otherwise be
    # sent to the Hub as part of the repo ID or file name.
    repo_id = repo_id.strip()
    filename = filename.strip()
    if not (repo_id and filename):
        st.error("Please enter both the repo ID and filename.")
        return

    try:
        df = read_parquet_from_hf(repo_id, filename)

        # DataFrame.printSchema() and DataFrame.show() print to stdout and
        # return None, so passing them to st.write only renders "None".
        # Build displayable values from the schema and a bounded sample.
        schema_rows = [
            {
                "column": field.name,
                "type": field.dataType.simpleString(),
                "nullable": field.nullable,
            }
            for field in df.schema.fields
        ]
        st.write("Dataset Schema:")
        st.table(schema_rows)

        st.write("First 10 rows of the dataset:")
        # limit() before toPandas() so only 10 rows are collected to the driver.
        st.dataframe(df.limit(10).toPandas())
    except Exception as e:
        # UI boundary: surface any download/Spark/rendering failure to the user.
        st.error(f"Error loading dataset: {e}")

# Render the page only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    load_and_display_dataset()