Spaces:

jaothan
/

sparksession

Sleeping

jaothan committed on Feb 15, 2025

Commit

abb054a

verified ·

1 Parent(s): 5f2d295

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import streamlit as st
+from pyspark.sql import SparkSession
+from huggingface_hub import HfApi, hf_hub_download
+import requests
+# Initialize a Spark session
+spark = SparkSession.builder.appName("HuggingFaceSpark").getOrCreate()
+# Function to read Parquet files from Hugging Face
+def read_parquet_from_hf(repo_id, filename):
+    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    df = spark.read.parquet(file_path)
+    return df
+# Example use case: Load a dataset from Hugging Face and display it
+def load_and_display_dataset():
+    st.title("Hugging Face Spark Integration")
+    repo_id = st.text_input("Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):")
+    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):")
+    if st.button("Load Dataset"):
+        if repo_id and filename:
+            try:
+                df = read_parquet_from_hf(repo_id, filename)
+                st.write("Dataset Schema:")
+                st.write(df.printSchema())
+                st.write("First 10 rows of the dataset:")
+                st.write(df.show(10))
+            except Exception as e:
+                st.error(f"Error loading dataset: {e}")
+        else:
+            st.error("Please enter both the repo ID and filename.")
+# Build the page only when this file is executed as the main script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    load_and_display_dataset()