# data-upload / app.py
# DevClivora's picture
# Update app.py
# a57c24b verified
# app.py
# A Flask API updated to use the proper Flask logger instead of print().
import os
import pandas as pd
import io
from flask import Flask, request, jsonify
from flask_cors import CORS
from huggingface_hub import HfApi
from datetime import datetime
import logging
# --- Initialization ---
# Create the Flask application and wrap it with flask_cors.
app = Flask(__name__)
CORS(app)

# Configure logging: Flask's own logger is used, at INFO level.
app.logger.setLevel(logging.INFO)
app.logger.info("--- Flask app.py is starting up! ---")

# --- Configuration ---
# All three settings are required. If any one of them is missing, every
# setting ends up as None, so the configuration is all-or-nothing.
app.logger.info("Loading environment variables...")
try:
    HF_TOKEN, AUDIO_DATASET_REPO_ID, TEXT_DATASET_REPO_ID = (
        os.environ["HF_TOKEN"],
        os.environ["AUDIO_DATASET_REPO_ID"],
        os.environ["TEXT_DATASET_REPO_ID"],
    )
except KeyError as missing_var:
    app.logger.error(f"FATAL ERROR: Missing secret environment variable: {missing_var}")
    HF_TOKEN = AUDIO_DATASET_REPO_ID = TEXT_DATASET_REPO_ID = None
else:
    app.logger.info("Successfully loaded all required environment variables.")

# --- Hugging Face API Client ---
# The client is only built when a (non-empty) token is available; otherwise
# `api` is None and the upload routes check for that.
if not HF_TOKEN:
    api = None
    app.logger.warning("Warning: HfApi not initialized because HF_TOKEN is not set.")
else:
    app.logger.info("Initializing HfApi client...")
    api = HfApi(token=HF_TOKEN)
    app.logger.info("HfApi client initialized.")
# --- Helper Function ---
def get_unique_filename():
    """Build a timestamped Parquet filename for a new upload.

    The name has the form ``data_YYYYMMDD_HHMMSS_ffffff.parquet`` and is
    derived from the current UTC time down to the microsecond. Two calls that
    land on the same microsecond produce the same name.

    Returns:
        str: The generated filename.
    """
    # Local import: the file's top-level import only brings in `datetime`.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC
    # "now" formats to exactly the same string with this pattern.
    now_utc = datetime.now(timezone.utc)
    return f"data_{now_utc.strftime('%Y%m%d_%H%M%S_%f')}.parquet"
# --- API Endpoints ---
@app.route('/')
def index():
    """Landing route: log the hit and return a fixed plain-text status line."""
    status_text = "Hugging Face Data Uploader API is running."
    app.logger.info("Request received for / route.")
    return status_text
@app.route('/add-audio', methods=['POST'])
def add_audio_data():
    """Store one audio/transcription record in the audio dataset.

    Expects a JSON object body containing at least the keys ``audio`` and
    ``transcription``. The whole object becomes a single-row DataFrame, is
    serialized to Parquet in memory, and is uploaded to
    ``AUDIO_DATASET_REPO_ID`` under a timestamp-based filename.

    Returns:
        A ``(json_response, status)`` pair:

        * 201 - the file was uploaded.
        * 400 - the body is not a JSON object or lacks a required key.
        * 500 - the Hugging Face client is not configured, or processing or
          uploading raised an exception.
    """
    app.logger.info("Request received for /add-audio route.")

    if not api:
        app.logger.error("API client not available for /add-audio.")
        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500

    try:
        app.logger.info("Attempting to process /add-audio data...")

        # silent=True makes get_json() return None for a wrong Content-Type or
        # malformed JSON instead of raising, so such client errors get the 400
        # below rather than being reported as a 500 by the handler at the end.
        data = request.get_json(silent=True)

        # Require a JSON object: on a string `in` is a substring test and on a
        # list an element test, so non-object bodies could pass a bare key check.
        if not isinstance(data, dict) or 'audio' not in data or 'transcription' not in data:
            app.logger.warning("Invalid payload received for /add-audio.")
            return jsonify({"error": "Invalid payload. 'audio' and 'transcription' fields are required."}), 400

        app.logger.info("Data validated. Creating DataFrame.")
        df = pd.DataFrame([data])

        # Serialize in memory and rewind so the upload reads from the start.
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False, engine='pyarrow')
        buffer.seek(0)

        app.logger.info(f"Uploading file to audio dataset: {AUDIO_DATASET_REPO_ID}")
        api.upload_file(
            path_or_fileobj=buffer,
            path_in_repo=get_unique_filename(),
            repo_id=AUDIO_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Add new audio-transcription pair"
        )
        app.logger.info("File successfully uploaded to audio dataset.")
        return jsonify({"message": "Audio data added successfully."}), 201
    except Exception as e:
        # Route-level boundary: log with traceback and answer with a 500.
        app.logger.exception(f"---! UNEXPECTED ERROR in /add-audio !---: {e}")
        # NOTE(review): "details" exposes the raw exception text to the client;
        # kept for backward compatibility, consider removing it.
        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500
@app.route('/add-text', methods=['POST'])
def add_text_data():
    """Store one transcription/summary record in the text dataset.

    Expects a JSON object body containing at least the keys ``transcription``
    and ``summary``. The whole object becomes a single-row DataFrame, is
    serialized to Parquet in memory, and is uploaded to
    ``TEXT_DATASET_REPO_ID`` under a timestamp-based filename.

    Returns:
        A ``(json_response, status)`` pair:

        * 201 - the file was uploaded.
        * 400 - the body is not a JSON object or lacks a required key.
        * 500 - the Hugging Face client is not configured, or processing or
          uploading raised an exception.
    """
    app.logger.info("Request received for /add-text route.")

    if not api:
        app.logger.error("API client not available for /add-text.")
        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500

    try:
        app.logger.info("Attempting to process /add-text data...")

        # silent=True makes get_json() return None for a wrong Content-Type or
        # malformed JSON instead of raising, so such client errors get the 400
        # below rather than being reported as a 500 by the handler at the end.
        data = request.get_json(silent=True)

        # Require a JSON object: on a string `in` is a substring test and on a
        # list an element test, so non-object bodies could pass a bare key check.
        if not isinstance(data, dict) or 'transcription' not in data or 'summary' not in data:
            app.logger.warning("Invalid payload received for /add-text.")
            return jsonify({"error": "Invalid payload. 'transcription' and 'summary' fields are required."}), 400

        app.logger.info("Data validated. Creating DataFrame.")
        df = pd.DataFrame([data])

        # Serialize in memory and rewind so the upload reads from the start.
        buffer = io.BytesIO()
        df.to_parquet(buffer, index=False, engine='pyarrow')
        buffer.seek(0)

        app.logger.info(f"Uploading file to text dataset: {TEXT_DATASET_REPO_ID}")
        api.upload_file(
            path_or_fileobj=buffer,
            path_in_repo=get_unique_filename(),
            repo_id=TEXT_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Add new transcription-summary pair"
        )
        app.logger.info("File successfully uploaded to text dataset.")
        return jsonify({"message": "Text data added successfully."}), 201
    except Exception as e:
        # Route-level boundary: log with traceback and answer with a 500.
        app.logger.exception(f"---! UNEXPECTED ERROR in /add-text !---: {e}")
        # NOTE(review): "details" exposes the raw exception text to the client;
        # kept for backward compatibility, consider removing it.
        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500
# To run on Hugging Face Spaces
# Only when this file is executed directly: start Flask's built-in server,
# bound to 0.0.0.0 on port 7860.
if __name__ == '__main__':
    app.logger.info("Starting Flask development server...")
    app.run(port=7860, host='0.0.0.0')