DevClivora committed on
Commit
8fe7a42
·
1 Parent(s): 457a594

added data upload files

Browse files
Files changed (3) hide show
  1. Dockerfile +25 -0
  2. app.py +139 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+ # This file tells Hugging Face Spaces how to build and run the Flask application.
3
+
4
+ # 1. Start with the slim Python 3.13 base image.
5
+ FROM python:3.13-slim
6
+
7
+ # 2. Set the working directory inside the container.
8
+ WORKDIR /code
9
+
10
+ # 3. Copy the requirements file into the working directory.
11
+ COPY ./requirements.txt /code/requirements.txt
12
+
13
+ # 4. Install the Python dependencies specified in the requirements file.
14
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
15
+
16
+ # 5. Copy the rest of your application's code (e.g., app.py) into the working directory.
17
+ COPY ./ /code/
18
+
19
+ # 6. Expose the port that the application will run on.
20
+ # Hugging Face Spaces typically use port 7860.
21
+ EXPOSE 7860
22
+
23
+ # 7. Define the command to run the application using Gunicorn.
24
+ # Gunicorn is a robust production web server for Python.
25
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # A Flask API to be hosted on a Hugging Face Space.
3
+ # This API receives data, converts it to Parquet format, and uploads it to specified HF datasets.
4
+ # This version includes CORS support to be called directly from a frontend application.
5
+
6
+ import os
7
+ import pandas as pd
8
+ import io
9
+ from flask import Flask, request, jsonify
10
+ from flask_cors import CORS # Import CORS
11
+ from huggingface_hub import HfApi, HfFolder
12
+ from datetime import datetime
13
+
14
+ # --- Initialization ---
15
app = Flask(__name__)

# --- Enable CORS ---
# Allow cross-origin requests so a browser frontend can call this API directly.
# Called without arguments, so no origin restriction is configured here.
# For production, consider restricting origins for better security,
# e.g., CORS(app, origins=["https://your-frontend-domain.com"])
CORS(app)

# --- Configuration ---
# These values must come from the Hugging Face Space's secrets (environment
# variables). DO NOT hardcode them here.
try:
    HF_TOKEN = os.environ["HF_TOKEN"]
    AUDIO_DATASET_REPO_ID = os.environ["AUDIO_DATASET_REPO_ID"]  # E.g., "YourUsername/audio-clips-dataset"
    TEXT_DATASET_REPO_ID = os.environ["TEXT_DATASET_REPO_ID"]  # E.g., "YourUsername/transcription-summaries-dataset"
except KeyError as e:
    print(f"FATAL ERROR: Missing secret environment variable: {e}")
    # If any one variable is missing, all three are reset to None so the
    # process still starts; `api` below then stays None.
    # In a real scenario, the app should fail fast if config is missing.
    HF_TOKEN, AUDIO_DATASET_REPO_ID, TEXT_DATASET_REPO_ID = None, None, None


# --- Hugging Face API Client ---
# Build the Hub client with the token passed explicitly. This replaces
# HfFolder.save_token(), which persisted the secret to the local token file
# as a side effect of starting the app.
if HF_TOKEN:
    api = HfApi(token=HF_TOKEN)
else:
    api = None
    print("Warning: HfApi not initialized because HF_TOKEN is not set.")
44
+
45
+
46
+ # --- Helper Function ---
47
def get_unique_filename():
    """
    Build a timestamp-based Parquet filename for a new upload.

    Returns:
        A string of the form "data_YYYYMMDD_HHMMSS_ffffff.parquet" based on
        the current UTC time with microsecond resolution. Two calls within
        the same microsecond would produce the same name.
    """
    # Local import: the module header only imports `datetime` itself.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to exactly the same string.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S_%f')
    return f"data_{timestamp}.parquet"
50
+
51
+
52
+ # --- API Endpoints ---
53
+
54
@app.route('/')
def index():
    """Liveness route: returns a fixed plain-text message showing the API is up."""
    status_message = "Hugging Face Data Uploader API is running."
    return status_message
58
+
59
@app.route('/add-audio', methods=['POST'])
def add_audio_data():
    """
    Receives an audio/transcription pair, converts it to Parquet, and uploads it to the audio dataset.

    Expected JSON payload: { "audio": "<base64_string>", "transcription": "<string>" }

    Returns:
        A (JSON response, status code) pair:
        201 when the file was uploaded,
        400 when the body is not a JSON object containing both fields,
        500 when the Hub client is not configured or conversion/upload fails.
    """
    if not api:
        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500

    # 1. Get and validate the JSON data from the request.
    # silent=True returns None for a missing, malformed, or non-JSON body, so
    # the client gets the 400 below instead of the error being raised and
    # reported as a 500 by the generic handler.
    data = request.get_json(silent=True)
    # Require a JSON object: on a JSON string, `'audio' in data` would be a
    # substring test and could let a bare string through.
    if not isinstance(data, dict) or 'audio' not in data or 'transcription' not in data:
        return jsonify({"error": "Invalid payload. 'audio' and 'transcription' fields are required."}), 400

    try:
        # 2. Convert the data into a single-row Pandas DataFrame
        df = pd.DataFrame([data])

        # 3. Convert the DataFrame to a Parquet file in an in-memory buffer
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False, engine='pyarrow')
        buffer.seek(0)  # Rewind so the upload reads from the start

        # 4. Upload the in-memory file to the Hugging Face Hub dataset repository
        api.upload_file(
            path_or_fileobj=buffer,
            path_in_repo=get_unique_filename(),
            repo_id=AUDIO_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Add new audio-transcription pair"
        )
    except Exception as e:
        # Top-level boundary for this endpoint: log and report as a 500.
        # NOTE(review): "details" exposes the raw exception text to the caller.
        print(f"Error in /add-audio: {e}")
        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500

    return jsonify({"message": "Audio data added successfully."}), 201
96
+
97
+
98
@app.route('/add-text', methods=['POST'])
def add_text_data():
    """
    Receives a transcription/summary pair, converts it to Parquet, and uploads it to the text dataset.

    Expected JSON payload: { "transcription": "<string>", "summary": "<string>" }

    Returns:
        A (JSON response, status code) pair:
        201 when the file was uploaded,
        400 when the body is not a JSON object containing both fields,
        500 when the Hub client is not configured or conversion/upload fails.
    """
    if not api:
        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500

    # 1. Get and validate the JSON data.
    # silent=True returns None for a missing, malformed, or non-JSON body, so
    # the client gets the 400 below instead of the error being raised and
    # reported as a 500 by the generic handler.
    data = request.get_json(silent=True)
    # Require a JSON object: on a JSON string, `'summary' in data` would be a
    # substring test and could let a bare string through.
    if not isinstance(data, dict) or 'transcription' not in data or 'summary' not in data:
        return jsonify({"error": "Invalid payload. 'transcription' and 'summary' fields are required."}), 400

    try:
        # 2. Convert to a single-row DataFrame
        df = pd.DataFrame([data])

        # 3. Convert to in-memory Parquet file
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False, engine='pyarrow')
        buffer.seek(0)  # Rewind so the upload reads from the start

        # 4. Upload the file to the Hub
        api.upload_file(
            path_or_fileobj=buffer,
            path_in_repo=get_unique_filename(),
            repo_id=TEXT_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Add new transcription-summary pair"
        )
    except Exception as e:
        # Top-level boundary for this endpoint: log and report as a 500.
        # NOTE(review): "details" exposes the raw exception text to the caller.
        print(f"Error in /add-text: {e}")
        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500

    return jsonify({"message": "Text data added successfully."}), 201
135
+
136
+ # Direct-run entry point (e.g. `python app.py` for local testing); the Docker image starts the app with Gunicorn instead.
137
+ # The standard port for Spaces is 7860.
138
if __name__ == "__main__":
    # Only reached when this file is executed as a script: start Flask's
    # built-in server on all interfaces, port 7860.
    app.run(host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ # Libraries needed to run the Flask API on Hugging Face Spaces.
3
+
4
+ Flask
5
+ pandas
6
+ huggingface_hub
7
+ pyarrow
8
+ gunicorn
9
+ flask-cors