Spaces:

Clivora
/

data-upload

Sleeping

App Files Files Community

DevClivora committed on Jul 9

Commit

8fe7a42

1 Parent(s): 457a594

added data upload files

Browse files

Files changed (3) hide show

Dockerfile +25 -0
app.py +139 -0
requirements.txt +9 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Dockerfile
+# This file tells Hugging Face Spaces how to build and run the Flask application.
+# 1. Start with the official slim Python 3.13 base image.
+FROM python:3.13-slim
+# 2. Set the working directory inside the container.
+WORKDIR /code
+# 3. Copy only the requirements file first, so the dependency-install layer
+#    below can be reused from cache when just the application code changes.
+COPY ./requirements.txt /code/requirements.txt
+# 4. Install the Python dependencies specified in the requirements file.
+#    --no-cache-dir keeps pip's download cache out of the image.
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# 5. Copy the rest of the application's code (e.g., app.py) into the working directory.
+COPY ./ /code/
+# 6. Expose the port that the application will run on.
+# Hugging Face Spaces typically use port 7860.
+EXPOSE 7860
+# 7. Define the command to run the application using Gunicorn.
+# "app:app" means: the `app` object defined in the `app.py` module.
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]

app.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# app.py
+# A Flask API to be hosted on a Hugging Face Space.
+# This API receives data, converts it to Parquet format, and uploads it to specified HF datasets.
+# This version includes CORS support to be called directly from a frontend application.
+import os
+import pandas as pd
+import io
+from flask import Flask, request, jsonify
+from flask_cors import CORS  # Import CORS
+from huggingface_hub import HfApi, HfFolder
+from datetime import datetime
+# --- Initialization ---
+app = Flask(__name__)
+# --- Enable CORS ---
+# This allows the API to accept requests from different domains (i.e., your frontend).
+# For production, you might want to restrict origins for better security.
+# e.g., CORS(app, origins=["https://your-frontend-domain.com"])
+CORS(app)
+# --- Configuration ---
+# It's crucial to get these values from the Hugging Face Space's secrets for security.
+# DO NOT hardcode them here.
+try:
+    HF_TOKEN = os.environ["HF_TOKEN"]
+    AUDIO_DATASET_REPO_ID = os.environ["AUDIO_DATASET_REPO_ID"] # E.g., "YourUsername/audio-clips-dataset"
+    TEXT_DATASET_REPO_ID = os.environ["TEXT_DATASET_REPO_ID"]   # E.g., "YourUsername/transcription-summaries-dataset"
+except KeyError as e:
+    print(f"FATAL ERROR: Missing secret environment variable: {e}")
+    # In a real scenario, the app should fail fast if config is missing.
+    HF_TOKEN, AUDIO_DATASET_REPO_ID, TEXT_DATASET_REPO_ID = None, None, None
+# --- Hugging Face API Client ---
+# Authenticate and initialize the Hub API client.
+if HF_TOKEN:
+    HfFolder.save_token(HF_TOKEN)
+    api = HfApi()
+else:
+    api = None
+    print("Warning: HfApi not initialized because HF_TOKEN is not set.")
+# --- Helper Function ---
+def get_unique_filename():
+    """Generates a unique filename based on the current timestamp to avoid collisions."""
+    return f"data_{datetime.utcnow().strftime('%Y%m%d_%H%M%S_%f')}.parquet"
+# --- API Endpoints ---
+@app.route('/')
+def index():
+    """A simple index route to confirm the API is running."""
+    return "Hugging Face Data Uploader API is running."
+@app.route('/add-audio', methods=['POST'])
+def add_audio_data():
+    """
+    Receives an audio/transcription pair, converts to Parquet, and uploads to the audio dataset.
+    Expected JSON payload: { "audio": "<base64_string>", "transcription": "<string>" }
+    """
+    if not api:
+        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500
+    try:
+        # 1. Get and validate the JSON data from the request
+        data = request.get_json()
+        if not data or 'audio' not in data or 'transcription' not in data:
+            return jsonify({"error": "Invalid payload. 'audio' and 'transcription' fields are required."}), 400
+        # 2. Convert the data into a Pandas DataFrame
+        df = pd.DataFrame([data])
+        # 3. Convert the DataFrame to a Parquet file in an in-memory buffer
+        buffer = io.BytesIO()
+        df.to_parquet(buffer, index=False, engine='pyarrow')
+        buffer.seek(0) # Rewind the buffer to the beginning
+        # 4. Upload the in-memory file to the Hugging Face Hub dataset repository
+        api.upload_file(
+            path_or_fileobj=buffer,
+            path_in_repo=get_unique_filename(),
+            repo_id=AUDIO_DATASET_REPO_ID,
+            repo_type="dataset",
+            commit_message="Add new audio-transcription pair"
+        )
+        return jsonify({"message": "Audio data added successfully."}), 201
+    except Exception as e:
+        print(f"Error in /add-audio: {e}")
+        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500
+@app.route('/add-text', methods=['POST'])
+def add_text_data():
+    """
+    Receives a transcription/summary pair, converts to Parquet, and uploads to the text dataset.
+    Expected JSON payload: { "transcription": "<string>", "summary": "<string>" }
+    """
+    if not api:
+        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500
+    try:
+        # 1. Get and validate the JSON data
+        data = request.get_json()
+        if not data or 'transcription' not in data or 'summary' not in data:
+            return jsonify({"error": "Invalid payload. 'transcription' and 'summary' fields are required."}), 400
+        # 2. Convert to DataFrame
+        df = pd.DataFrame([data])
+        # 3. Convert to in-memory Parquet file
+        buffer = io.BytesIO()
+        df.to_parquet(buffer, index=False, engine='pyarrow')
+        buffer.seek(0)
+        # 4. Upload the file to the Hub
+        api.upload_file(
+            path_or_fileobj=buffer,
+            path_in_repo=get_unique_filename(),
+            repo_id=TEXT_DATASET_REPO_ID,
+            repo_type="dataset",
+            commit_message="Add new transcription-summary pair"
+        )
+        return jsonify({"message": "Text data added successfully."}), 201
+    except Exception as e:
+        print(f"Error in /add-text: {e}")
+        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500
+# To run on Hugging Face Spaces, the app needs to be runnable.
+# The standard port for Spaces is 7860.
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# requirements.txt
+# Libraries needed to run the Flask API on Hugging Face Spaces.
+Flask
+pandas
+huggingface_hub
+pyarrow
+gunicorn
+flask-cors