jaothan committed on
Commit
abb054a
·
verified ·
1 Parent(s): 5f2d295

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -0
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pyspark.sql import SparkSession
3
+ from huggingface_hub import HfApi, hf_hub_download
4
+ import requests
5
+
6
# Module-wide Spark session used by the Parquet reader below.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("HuggingFaceSpark")
    .getOrCreate()
)
8
+
9
+ # Function to read Parquet files from Hugging Face
10
def read_parquet_from_hf(repo_id, filename, repo_type="dataset"):
    """Download a Parquet file from the Hugging Face Hub and load it with Spark.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/dataset_name"``.
        filename: Path of the Parquet file inside the repository.
        repo_type: Kind of Hub repository to download from. Defaults to
            ``"dataset"`` because this app prompts for a dataset repo;
            without it ``hf_hub_download`` looks in a *model* repo and
            cannot find dataset files.

    Returns:
        The Spark DataFrame produced by ``spark.read.parquet`` on the
        locally downloaded file.
    """
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
    )
    return spark.read.parquet(local_path)
14
+
15
+ # Example use case: Load a dataset from Hugging Face and display it
16
def load_and_display_dataset():
    """Render the Streamlit page that loads a Parquet file and previews it.

    Shows two text inputs (repo ID and file name) and a "Load Dataset"
    button. When the button is pressed with both fields filled in, the file
    is loaded through ``read_parquet_from_hf`` and its schema and first 10
    rows are rendered on the page. Any failure while loading or rendering
    is reported with ``st.error`` rather than propagated.
    """
    st.title("Hugging Face Spark Integration")

    # Strip stray whitespace (e.g. from copy/paste) so it neither counts as
    # a filled-in field nor ends up inside the Hub lookup.
    repo_id = st.text_input("Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):").strip()
    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):").strip()

    if not st.button("Load Dataset"):
        return

    if not (repo_id and filename):
        st.error("Please enter both the repo ID and filename.")
        return

    try:
        df = read_parquet_from_hf(repo_id, filename)

        st.write("Dataset Schema:")
        # DataFrame.printSchema() prints to stdout and returns None, so it
        # cannot be handed to Streamlit; build the text from df.dtypes.
        schema_text = "\n".join(f"{name}: {dtype}" for name, dtype in df.dtypes)
        st.text(schema_text)

        st.write("First 10 rows of the dataset:")
        # DataFrame.show() likewise returns None; collect only the first 10
        # rows to the driver and render them as a table.
        st.dataframe(df.limit(10).toPandas())
    except Exception as e:
        # UI boundary: surface any download/Spark failure to the user.
        st.error(f"Error loading dataset: {e}")
34
+
35
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    load_and_display_dataset()