Spaces:

jhansi1
/

train

Sleeping

App Files Files Community

jhansi1 committed on Nov 8, 2024

Commit

db1baea

verified ·

1 Parent(s): 009fbcd

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -6

app.py CHANGED Viewed

@@ -3,16 +3,30 @@ import streamlit as st
 from transformers import pipeline
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
-from datasets import load_dataset
 # Initialize text-generation pipeline with the model
 model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
 pipe = pipeline("text-generation", model=model_name)
-# Load the dataset from the cloned local direc/tory
-ds = load_dataset("./canadian-legal-data", split="train",verify=False)
 # Gradio Interface setup
 def respond(
@@ -51,9 +65,10 @@ def streamlit_interface():
     st.title("Canadian Legal Text Generator")
     st.write("Enter a prompt related to Canadian legal data and generate text using Llama-3.1.")
-    # Show dataset sample
     st.subheader("Sample Data from Canadian Legal Dataset:")
-    st.write(ds[:5])  # Display the first 5 rows of the dataset
     # Prompt input
     prompt = st.text_area("Enter your prompt:", placeholder="Type something...")
@@ -86,4 +101,4 @@ if __name__ == "__main__":
                 gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
             ],
         )
-        demo.launch()

 from transformers import pipeline
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
+import subprocess
+import os
+# Clone the dataset repository once; the clone is skipped when repo_dir already exists
+repo_url = "https://huggingface.co/datasets/BEE-spoke-data/survivorslib-law-books"
+repo_dir = "./survivorslib-law-books"
+if not os.path.exists(repo_dir):
+    subprocess.run(["git", "clone", repo_url], check=True)
+# Load train.parquet from the clone. NOTE(review): assumes the file sits at the repo root - confirm
+dataset_path = os.path.join(repo_dir, "train.parquet")
+ds = load_dataset("parquet", data_files=dataset_path)
 # Initialize text-generation pipeline with the model
 model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
 pipe = pipeline("text-generation", model=model_name)
+# Return a short preview of the dataset for display (no transformation is applied).
+# NOTE(review): assumes a 'content' column exists; confirm the schema and adjust the key if not.
+def preprocess_data(dataset):
+    # Index the 'content' column, then slice off its first 5 entries.
+    # If the column is named differently (e.g. 'text' or 'paragraph'), change the key below.
+    return dataset['content'][:5]  # Only the first 5 entries, for brevity
 # Gradio Interface setup
 def respond(
     st.title("Canadian Legal Text Generator")
     st.write("Enter a prompt related to Canadian legal data and generate text using Llama-3.1.")
+    # Show dataset sample (first 5 entries)
     st.subheader("Sample Data from Canadian Legal Dataset:")
+    sample_data = preprocess_data(ds['train'])  # Assuming 'train' split
+    st.write(sample_data)  # Display the first 5 rows of the dataset
     # Prompt input
     prompt = st.text_area("Enter your prompt:", placeholder="Type something...")
                 gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
             ],
         )
+        demo.launch()