# Captured from a Hugging Face Space file view (Space status: Sleeping).
# File size: 1,349 Bytes; commit abb054a.
import streamlit as st
from pyspark.sql import SparkSession
from huggingface_hub import HfApi, hf_hub_download
import requests
# Module-wide Spark session; getOrCreate() hands back an already-active
# session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("HuggingFaceSpark")
    .getOrCreate()
)
# Download a Parquet file from the Hugging Face Hub and load it with Spark
def read_parquet_from_hf(repo_id, filename, repo_type="dataset"):
    """Fetch a Parquet file from the Hugging Face Hub as a Spark DataFrame.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/dataset_name"``.
        filename: Path of the Parquet file inside the repository.
        repo_type: Kind of Hub repository to look in. Defaults to
            ``"dataset"`` because this app loads dataset repos; without an
            explicit type ``hf_hub_download`` looks the repo up as a model
            repo and dataset IDs are not found. Pass ``"model"`` or
            ``"space"`` to read from those repo kinds instead.

    Returns:
        The DataFrame produced by ``spark.read.parquet`` for the downloaded
        file.

    Raises:
        Whatever ``hf_hub_download`` or ``spark.read.parquet`` raise (for
        example when the repo or file does not exist); nothing is caught here.
    """
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
    )
    return spark.read.parquet(local_path)
# Example use case: Load a dataset from Hugging Face and display it
def load_and_display_dataset():
    """Render the Streamlit page that loads a Hub Parquet file and previews it.

    Shows two text inputs (repo ID and file name) and a "Load Dataset"
    button. When the button is pressed with both fields filled in, the file
    is loaded through ``read_parquet_from_hf`` and the page displays the
    DataFrame schema followed by its first 10 rows. Missing input or any
    failure while loading is reported with ``st.error``.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("Hugging Face Spark Integration")
    # Strip stray whitespace (e.g. from copy-paste) so it neither counts as
    # input nor ends up in the Hub lookup.
    repo_id = st.text_input("Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):").strip()
    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):").strip()

    if not st.button("Load Dataset"):
        return

    if not (repo_id and filename):
        st.error("Please enter both the repo ID and filename.")
        return

    try:
        df = read_parquet_from_hf(repo_id, filename)
        # printSchema() and show() only print to stdout and return None, so
        # passing their result to st.write() renders nothing useful. Build
        # real values to display instead.
        schema = df.schema.jsonValue()
        # limit() before toPandas() so only 10 rows are collected.
        preview = df.limit(10).toPandas()

        st.write("Dataset Schema:")
        st.json(schema)
        st.write("First 10 rows of the dataset:")
        st.dataframe(preview)
    except Exception as e:
        # UI boundary: surface any download/Spark failure on the page.
        st.error(f"Error loading dataset: {e}")
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    load_and_display_dataset()