Add Dockerized Flask application files for deployment
Browse files- Dockerfile +66 -0
- flask_Character.py +31 -0
- requirements.txt +7 -0
- utils/__pycache__/config.cpython-311.pyc +0 -0
- utils/__pycache__/config.cpython-312.pyc +0 -0
- utils/__pycache__/generate_face_shapes.cpython-311.pyc +0 -0
- utils/__pycache__/generate_face_shapes.cpython-312.pyc +0 -0
- utils/audio/extraction/__pycache__/extract_features.cpython-311.pyc +0 -0
- utils/audio/extraction/__pycache__/extract_features.cpython-312.pyc +0 -0
- utils/audio/extraction/extract_features.py +245 -0
- utils/audio/processing/__pycache__/audio_processing.cpython-311.pyc +0 -0
- utils/audio/processing/__pycache__/audio_processing.cpython-312.pyc +0 -0
- utils/audio/processing/audio_processing.py +166 -0
- utils/config.py +12 -0
- utils/generate_face_shapes.py +22 -0
- utils/model/__pycache__/model.cpython-311.pyc +0 -0
- utils/model/__pycache__/model.cpython-312.pyc +0 -0
- utils/model/model.pth +3 -0
- utils/model/model.py +256 -0
Dockerfile
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: slim Python 3.9. Pinning a specific tag keeps builds reproducible.
FROM python:3.9-slim

# PYTHONUNBUFFERED streams stdout/stderr without buffering so logs appear live.
# The PIP_* variables disable the pip cache and version check and raise the
# default network timeout (large wheels such as torch need it).
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100

# All subsequent COPY/RUN/CMD instructions execute relative to /app.
WORKDIR /app

# Copy the requirements file first to leverage Docker's layer caching:
# if only application code changes, the dependency layer is reused.
COPY requirements.txt .

# Install the Python dependencies listed in requirements.txt
# (Flask, numpy, torch, gunicorn, librosa, scipy).
RUN pip install -r requirements.txt

# Copy the rest of the project into the image: flask_Character.py, the
# utils/ package, and the model checkpoint (utils/model/model.pth).
COPY . /app

# Document the listening port; Hugging Face Spaces' Docker SDK expects 7860.
# EXPOSE does not publish the port by itself.
EXPOSE 7860

# Default container command: run the Flask app under gunicorn with 4 worker
# processes, bound to all interfaces on port 7860.
# 'flask_Character:app' is [module_name]:[Flask instance variable], i.e. the
# `app = flask.Flask(__name__)` object in flask_Character.py.
CMD ["gunicorn", "--workers", "4", "--bind", "0.0.0.0:7860", "flask_Character:app"]

# NOTE: any `if __name__ == '__main__': app.run(...)` block in
# flask_Character.py is ignored here — gunicorn imports the module and starts
# the application itself, so the development server is never used.
|
flask_Character.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# This software is licensed under a **dual-license model**
|
| 3 |
+
# For individuals and businesses earning **under $1M per year**, this software is licensed under the **MIT License**
|
| 4 |
+
# Businesses or organizations with **annual revenue of $1,000,000 or more** must obtain permission to use this software commercially.
|
| 5 |
+
|
| 6 |
+
from flask import request, jsonify
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import flask
|
| 10 |
+
|
| 11 |
+
from utils.generate_face_shapes import generate_facial_data_from_bytes
|
| 12 |
+
from utils.model.model import load_model
|
| 13 |
+
from utils.config import config
|
| 14 |
+
|
| 15 |
+
app = flask.Flask(__name__)
|
| 16 |
+
|
| 17 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 18 |
+
print("Activated device:", device)
|
| 19 |
+
|
| 20 |
+
model_path = 'utils/model/model.pth'
|
| 21 |
+
blendshape_model = load_model(model_path, config, device)
|
| 22 |
+
|
| 23 |
+
@app.route('/audio_to_blendshapes', methods=['POST'])
def audio_to_blendshapes_route():
    """POST raw audio bytes in the request body; respond with JSON containing
    a 'blendshapes' list of per-frame coefficient rows."""
    audio_bytes = request.data
    generated_facial_data = generate_facial_data_from_bytes(audio_bytes, blendshape_model, device, config)
    # jsonify cannot serialize numpy arrays; convert to a plain Python list first.
    generated_facial_data_list = generated_facial_data.tolist() if isinstance(generated_facial_data, np.ndarray) else generated_facial_data

    return jsonify({'blendshapes': generated_facial_data_list})
| 30 |
+
|
| 31 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask
|
| 2 |
+
numpy
|
| 3 |
+
torch
|
| 4 |
+
gunicorn
|
| 5 |
+
|
| 6 |
+
librosa
|
| 7 |
+
scipy
|
utils/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (403 Bytes). View file
|
|
|
utils/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (392 Bytes). View file
|
|
|
utils/__pycache__/generate_face_shapes.cpython-311.pyc
ADDED
|
Binary file (882 Bytes). View file
|
|
|
utils/__pycache__/generate_face_shapes.cpython-312.pyc
ADDED
|
Binary file (798 Bytes). View file
|
|
|
utils/audio/extraction/__pycache__/extract_features.cpython-311.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
utils/audio/extraction/__pycache__/extract_features.cpython-312.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
utils/audio/extraction/extract_features.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This software is licensed under a **dual-license model**
|
| 2 |
+
# For individuals and businesses earning **under $1M per year**, this software is licensed under the **MIT License**
|
| 3 |
+
# Businesses or organizations with **annual revenue of $1,000,000 or more** must obtain permission to use this software commercially.
|
| 4 |
+
|
| 5 |
+
# extract_features.py
|
| 6 |
+
import io
|
| 7 |
+
import librosa
|
| 8 |
+
import numpy as np
|
| 9 |
+
import scipy.signal
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_audio_features(audio_input, sr=88200, from_bytes=False):
    """Load audio (file path or encoded bytes) and build framed features.

    Returns (features, samples); (None, None) when the clip is too short.
    Falls back to raw-PCM decoding when WAV parsing fails.
    """
    try:
        loader = load_audio_from_bytes if from_bytes else load_and_preprocess_audio
        y, sr = loader(audio_input, sr)
    except Exception as e:
        print(f"Loading as WAV failed: {e}\nFalling back to PCM loading.")
        y = load_pcm_audio_from_bytes(audio_input)

    frame_length = int(0.01667 * sr)  # ~1/60 s per frame (60 fps animation)
    hop_length = frame_length // 2    # 2x overlap for smoother transitions
    min_frames = 9                    # minimum frames needed for delta features

    num_frames = (len(y) - frame_length) // hop_length + 1
    if num_frames < min_frames:
        print(f"Audio file is too short: {num_frames} frames, required: {min_frames} frames")
        return None, None

    return extract_and_combine_features(y, sr, frame_length, hop_length), y
|
| 35 |
+
|
| 36 |
+
def extract_and_combine_features(y, sr, frame_length, hop_length, include_autocorr=True):
    """Stack per-frame MFCC features and, optionally, autocorrelation features."""
    feature_blocks = [extract_mfcc_features(y, sr, frame_length, hop_length)]

    if include_autocorr:
        feature_blocks.append(
            extract_autocorrelation_features(y, sr, frame_length, hop_length)
        )

    return np.hstack(feature_blocks)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def extract_mfcc_features(y, sr, frame_length, hop_length, num_mfcc=23):
    """Compute MFCC(+delta) features, halve the frame rate, return frames as rows."""
    raw = extract_overlapping_mfcc(y, sr, num_mfcc, frame_length, hop_length)
    return reduce_features(raw).T
|
| 57 |
+
|
| 58 |
+
def cepstral_mean_variance_normalization(mfcc):
    """Apply CMVN: zero-mean, unit-variance per coefficient across the time axis.

    A small epsilon keeps the division safe for constant (zero-variance) rows.
    """
    centered = mfcc - np.mean(mfcc, axis=1, keepdims=True)
    return centered / (np.std(mfcc, axis=1, keepdims=True) + 1e-10)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def extract_overlapping_mfcc(chunk, sr, num_mfcc, frame_length, hop_length, include_deltas=True, include_cepstral=True, threshold=1e-5):
    """Compute MFCCs for `chunk`, optionally CMVN-normalized and stacked with
    first- and second-order deltas.

    Returns an array of shape (num_mfcc, frames), or (3*num_mfcc, frames)
    when `include_deltas` is True.

    NOTE(review): `threshold` is accepted but never used in this body —
    confirm whether it can be removed or was meant to gate something.
    """
    mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=num_mfcc, n_fft=frame_length, hop_length=hop_length)
    if include_cepstral:
        # Per-coefficient mean/variance normalization across time.
        mfcc = cepstral_mean_variance_normalization(mfcc)

    if include_deltas:
        delta_mfcc = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)
        combined_mfcc = np.vstack([mfcc, delta_mfcc, delta2_mfcc]) # Stack original MFCCs with deltas
        return combined_mfcc
    else:
        return mfcc
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def reduce_features(features):
    """Halve the frame rate by averaging adjacent frame pairs (columns).

    When the frame count is odd, the final unpaired frame is appended as-is,
    so the output has ceil(n/2) columns.
    """
    n_cols = features.shape[1]
    even_part = features[:, : (n_cols // 2) * 2]
    halved = even_part.reshape(features.shape[0], -1, 2).mean(axis=2)

    if n_cols % 2:
        return np.hstack((halved, features[:, -1].reshape(-1, 1)))
    return halved
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def extract_overlapping_autocorr(y, sr, frame_length, hop_length, num_autocorr_coeff=187, pad_signal=True, padding_mode="reflect", trim_padded=False):
    """Compute per-frame normalized autocorrelation coefficients.

    The signal is optionally center-padded, framed, mean-removed and Hann
    windowed; each frame's autocorrelation is normalized by its zero-lag
    energy. Returns an array of shape (num_autocorr_coeff, frames) — the
    zero-lag coefficient itself is dropped as redundant (always 1 after
    normalization).
    """
    if pad_signal:
        # Center-pad by half a frame so edge samples get full frame coverage.
        pad = frame_length // 2
        y_padded = np.pad(y, pad_width=pad, mode=padding_mode)
    else:
        y_padded = y

    frames = librosa.util.frame(y_padded, frame_length=frame_length, hop_length=hop_length)
    if pad_signal and trim_padded:
        # Keep only frames that fall entirely within the unpadded signal span.
        num_frames = frames.shape[1]
        start_indices = np.arange(num_frames) * hop_length
        valid_idx = np.where((start_indices >= pad) & (start_indices + frame_length <= len(y) + pad))[0]
        frames = frames[:, valid_idx]

    # Remove per-frame DC offset, then apply a Hann window before correlating.
    frames = frames - np.mean(frames, axis=0, keepdims=True)
    hann_window = np.hanning(frame_length)
    windowed_frames = frames * hann_window[:, np.newaxis]

    autocorr_list = []
    for frame in windowed_frames.T:
        full_corr = np.correlate(frame, frame, mode='full')
        mid = frame_length - 1  # Zero-lag index.
        # Extract `num_autocorr_coeff + 1` to include the first column initially
        wanted = full_corr[mid: mid + num_autocorr_coeff + 1]
        # Normalize by the zero-lag (energy) if nonzero.
        if wanted[0] != 0:
            wanted = wanted / wanted[0]
        autocorr_list.append(wanted)

    # Convert list to array and transpose so that shape is (num_autocorr_coeff + 1, num_valid_frames)
    autocorr_features = np.array(autocorr_list).T
    # Remove the first coefficient to avoid redundancy
    autocorr_features = autocorr_features[1:, :]

    # Repair near-silent first/last frames by copying their neighbours.
    autocorr_features = fix_edge_frames_autocorr(autocorr_features)

    return autocorr_features
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def fix_edge_frames_autocorr(autocorr_features, zero_threshold=1e-7):
    """If the first or last frame is near all-zero, replicate from adjacent frames.

    Operates in place on `autocorr_features` (shape: coeffs x frames) and
    returns it. Fix: guard against inputs with fewer than two frames, which
    previously raised IndexError when a lone frame was near-silent.
    """
    # Need at least two frames to borrow from a neighbour.
    if autocorr_features.shape[1] < 2:
        return autocorr_features
    # Check first frame energy
    if np.all(np.abs(autocorr_features[:, 0]) < zero_threshold):
        autocorr_features[:, 0] = autocorr_features[:, 1]
    # Check last frame energy
    if np.all(np.abs(autocorr_features[:, -1]) < zero_threshold):
        autocorr_features[:, -1] = autocorr_features[:, -2]
    return autocorr_features
|
| 141 |
+
|
| 142 |
+
def extract_autocorrelation_features(
    y, sr, frame_length, hop_length, include_deltas=False
):
    """
    Extract autocorrelation features, optionally with deltas/delta-deltas,
    then align with the MFCC frame count, reduce, and handle first/last frames.
    """
    base = extract_overlapping_autocorr(y, sr, frame_length, hop_length)

    if include_deltas:
        base = compute_autocorr_with_deltas(base)

    return reduce_features(base).T
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def compute_autocorr_with_deltas(autocorr_base):
    """Stack autocorrelation features with their first- and second-order deltas
    along the coefficient axis (triples the row count)."""
    delta_ac = librosa.feature.delta(autocorr_base)
    delta2_ac = librosa.feature.delta(autocorr_base, order=2)
    combined_autocorr = np.vstack([autocorr_base, delta_ac, delta2_ac])
    return combined_autocorr
|
| 166 |
+
|
| 167 |
+
def load_and_preprocess_audio(audio_path, sr=88200):
    """Load an audio file, force an 88200 Hz rate, and peak-normalize to [-1, 1]."""
    y, sr = load_audio(audio_path, sr)
    if sr != 88200:
        y = librosa.resample(y, orig_sr=sr, target_sr=88200)
        sr = 88200

    # Peak normalization; skipped for all-silent input to avoid divide-by-zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    return y, sr
|
| 178 |
+
|
| 179 |
+
def load_audio(audio_path, sr=88200):
    """Load an audio file at the requested sample rate via librosa."""
    y, sr = librosa.load(audio_path, sr=sr)
    print(f"Loaded audio file '{audio_path}' with sample rate {sr}")
    return y, sr
|
| 183 |
+
|
| 184 |
+
def load_audio_from_bytes(audio_bytes, sr=88200):
    """Decode encoded audio bytes (e.g. WAV) and peak-normalize to [-1, 1]."""
    audio_file = io.BytesIO(audio_bytes)
    y, sr = librosa.load(audio_file, sr=sr)

    # Peak normalization; skipped for all-silent input to avoid divide-by-zero.
    max_val = np.max(np.abs(y))
    if max_val > 0:
        y = y / max_val

    return y, sr
|
| 193 |
+
|
| 194 |
+
def load_audio_file_from_memory(audio_bytes, sr=88200):
    """Load audio from memory bytes.

    Same as load_audio_from_bytes: decode via librosa at the requested rate
    and peak-normalize to [-1, 1].
    """
    y, sr = librosa.load(io.BytesIO(audio_bytes), sr=sr)
    print(f"Loaded audio data with sample rate {sr}")

    # Peak normalization; skipped for all-silent input.
    max_val = np.max(np.abs(y))
    if max_val > 0:
        y = y / max_val

    return y, sr
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def load_pcm_audio_from_bytes(audio_bytes, sr=22050, channels=1, sample_width=2):
    """
    Decode raw little-endian 16-bit PCM bytes to float32 in [-1, 1] and
    resample to 88200 Hz. Multi-channel data is resampled per channel.
    """
    # Only 16-bit samples are supported.
    if sample_width != 2:
        raise ValueError("Unsupported sample width")

    samples = np.frombuffer(audio_bytes, dtype=np.int16)

    # Interleaved multi-channel data becomes one column per channel.
    if channels > 1:
        samples = samples.reshape(-1, channels)

    # Normalize to [-1, 1] using the int16 full-scale magnitude.
    y = samples.astype(np.float32) / 32768.0

    target_sr = 88200
    if sr != target_sr:
        resampled_len = int(len(y) * target_sr / sr)
        if channels > 1:
            resampled = np.zeros((resampled_len, channels), dtype=np.float32)
            for ch in range(channels):
                resampled[:, ch] = scipy.signal.resample(y[:, ch], resampled_len)
        else:
            resampled = scipy.signal.resample(y, resampled_len)
        y = resampled

    return y
|
utils/audio/processing/__pycache__/audio_processing.cpython-311.pyc
ADDED
|
Binary file (7.74 kB). View file
|
|
|
utils/audio/processing/__pycache__/audio_processing.cpython-312.pyc
ADDED
|
Binary file (6.85 kB). View file
|
|
|
utils/audio/processing/audio_processing.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This software is licensed under a **dual-license model**
|
| 2 |
+
# For individuals and businesses earning **under $1M per year**, this software is licensed under the **MIT License**
|
| 3 |
+
# Businesses or organizations with **annual revenue of $1,000,000 or more** must obtain permission to use this software commercially.
|
| 4 |
+
|
| 5 |
+
# audio_processing.py
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
from torch.cuda.amp import autocast
|
| 10 |
+
|
| 11 |
+
def decode_audio_chunk(audio_chunk, model, device, config):
    """Run one feature chunk through the model's encoder/decoder and return
    the decoded frames as a numpy array (batch dimension removed).

    Precision is selected from config['use_half_precision'] (default True).
    NOTE(review): float16 tensors plus torch.cuda.amp.autocast assume a CUDA
    device — confirm behavior when `device` is CPU.
    """
    # Use precision based on config
    use_half_precision = config.get("use_half_precision", True)

    # Force float16 if half precision is desired; else float32
    dtype = torch.float16 if use_half_precision else torch.float32

    # Convert audio chunk directly to the desired precision; unsqueeze adds
    # the batch dimension the model expects.
    src_tensor = torch.tensor(audio_chunk, dtype=dtype).unsqueeze(0).to(device)

    with torch.no_grad():
        if use_half_precision:

            with autocast(dtype=torch.float16):
                encoder_outputs = model.encoder(src_tensor)
                output_sequence = model.decoder(encoder_outputs)
        else:
            encoder_outputs = model.encoder(src_tensor)
            output_sequence = model.decoder(encoder_outputs)

    # Convert output tensor back to numpy array
    decoded_outputs = output_sequence.squeeze(0).cpu().numpy()
    return decoded_outputs
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def concatenate_outputs(all_decoded_outputs, num_frames):
    """Join decoded chunks along the frame axis and trim to `num_frames`."""
    return np.concatenate(all_decoded_outputs, axis=0)[:num_frames]
|
| 40 |
+
|
| 41 |
+
def ensure_2d(final_decoded_outputs):
    """Collapse a 3-D (chunks, frames, features) array to 2-D (frames, features).

    Arrays that are already 2-D (or any other rank) pass through untouched.
    """
    if final_decoded_outputs.ndim != 3:
        return final_decoded_outputs
    return final_decoded_outputs.reshape(-1, final_decoded_outputs.shape[-1])
|
| 45 |
+
|
| 46 |
+
def pad_audio_chunk(audio_chunk, frame_length, num_features, pad_mode='replicate'):
    """
    Pad `audio_chunk` (shape: frames x num_features) up to `frame_length` frames.

    pad_mode:
        'reflect'   — mirror the tail of the chunk.
        'replicate' — repeat the final frame.

    Chunks already at or above `frame_length` frames are returned unchanged.
    Raises ValueError for any other pad_mode.
    """
    deficit = frame_length - audio_chunk.shape[0]
    if deficit <= 0:
        return audio_chunk

    if pad_mode == 'reflect':
        # Mirror-pad along the frame axis, then append the reflected tail.
        reflected = np.pad(
            audio_chunk,
            pad_width=((0, deficit), (0, 0)),
            mode='reflect'
        )
        return np.vstack((audio_chunk, reflected[-deficit:, :num_features]))

    if pad_mode == 'replicate':
        # Repeat the final frame `deficit` times.
        tail = np.tile(audio_chunk[-1:], (deficit, 1))
        return np.vstack((audio_chunk, tail))

    raise ValueError(f"Unsupported pad_mode: {pad_mode}. Choose 'reflect' or 'replicate'.")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def blend_chunks(chunk1, chunk2, overlap):
    """Crossfade the tail of chunk1 into the head of chunk2 over `overlap` frames.

    The effective overlap is clamped to the length of the shorter chunk;
    with no overlap the chunks are simply stacked.
    """
    fade_len = min(overlap, len(chunk1), len(chunk2))
    if fade_len == 0:
        return np.vstack((chunk1, chunk2))

    merged = np.copy(chunk1)
    # Linear fade weights 0, 1/fade_len, ..., (fade_len-1)/fade_len,
    # broadcast across the trailing feature dimensions.
    ramp = np.arange(fade_len) / fade_len
    ramp = ramp.reshape((fade_len,) + (1,) * (chunk1.ndim - 1))
    merged[-fade_len:] = (1 - ramp) * chunk1[-fade_len:] + ramp * chunk2[:fade_len]

    return np.vstack((merged, chunk2[fade_len:]))
|
| 98 |
+
|
| 99 |
+
def process_audio_features(audio_features, model, device, config):
    """Decode framed audio features into blendshape frames.

    Features are processed in overlapping chunks of config['frame_size']
    frames; consecutive chunk outputs are crossfaded over the overlap region,
    the result is trimmed to the input frame count, scaled, eased in at the
    start, and finally has fixed columns zeroed.
    """
    # Configuration settings
    frame_length = config['frame_size'] # Number of frames per chunk (e.g., 64)
    overlap = config.get('overlap', 32) # Number of overlapping frames between chunks
    num_features = audio_features.shape[1]
    num_frames = audio_features.shape[0]
    all_decoded_outputs = []

    # Set model to evaluation mode
    model.eval()

    # Process chunks with the specified overlap
    start_idx = 0
    while start_idx < num_frames:
        end_idx = min(start_idx + frame_length, num_frames)

        # Select and pad chunk if needed
        audio_chunk = audio_features[start_idx:end_idx]
        audio_chunk = pad_audio_chunk(audio_chunk, frame_length, num_features)

        # Pass config to dynamically choose precision
        decoded_outputs = decode_audio_chunk(audio_chunk, model, device, config)
        # Drop any frames that came from padding.
        decoded_outputs = decoded_outputs[:end_idx - start_idx]

        # Blend with the last chunk if it exists
        if all_decoded_outputs:
            last_chunk = all_decoded_outputs.pop()
            blended_chunk = blend_chunks(last_chunk, decoded_outputs, overlap)
            all_decoded_outputs.append(blended_chunk)
        else:
            all_decoded_outputs.append(decoded_outputs)

        # Move start index forward by (frame_length - overlap)
        start_idx += frame_length - overlap

    # Process any remaining frames to ensure total frame count matches input
    current_length = sum(len(chunk) for chunk in all_decoded_outputs)
    if current_length < num_frames:
        remaining_frames = num_frames - current_length
        final_chunk_start = num_frames - remaining_frames
        audio_chunk = audio_features[final_chunk_start:num_frames]
        audio_chunk = pad_audio_chunk(audio_chunk, frame_length, num_features)
        decoded_outputs = decode_audio_chunk(audio_chunk, model, device, config)
        all_decoded_outputs.append(decoded_outputs[:remaining_frames])

    # Concatenate all chunks and trim to the original frame count
    final_decoded_outputs = np.concatenate(all_decoded_outputs, axis=0)[:num_frames]

    # Normalize or apply any post-processing
    final_decoded_outputs = ensure_2d(final_decoded_outputs)
    final_decoded_outputs[:, :61] /= 100 # Normalize specific columns

    # Easing effect for smooth start: fades in the first int(0.1 * 60) = 6
    # frames (~0.1 s at 60 fps), clamped to the available frame count.
    ease_duration_frames = min(int(0.1 * 60), final_decoded_outputs.shape[0])
    easing_factors = np.linspace(0, 1, ease_duration_frames)[:, None]
    final_decoded_outputs[:ease_duration_frames] *= easing_factors

    # Zero out unnecessary columns (optional post-processing)
    final_decoded_outputs = zero_columns(final_decoded_outputs)

    return final_decoded_outputs
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def zero_columns(data):
    """Return a copy of `data` with a fixed set of blendshape columns zeroed.

    The input array is left untouched.
    """
    silenced = [0, 1, 2, 3, 4, 7, 8, 9, 10, 11,
                51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
    result = data.copy()
    result[:, silenced] = 0
    return result
|
utils/config.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Shared model/audio hyper-parameters. These values must match the
# architecture of the checkpoint loaded from utils/model/model.pth
# (see utils/model/model.py and the audio pipeline in utils/audio/).
config = {
    'sr': 88200,  # audio sample rate used throughout the feature pipeline
    'frame_rate': 60,  # animation frames per second
    'hidden_dim': 1024,  # model hidden size
    'n_layers': 8,  # number of model layers
    'num_heads': 16,  # attention heads
    'dropout': 0.0,
    'output_dim': 68, # if you trained your own, this should also be 61
    'input_dim': 256,  # feature vector size per frame
    'frame_size': 128,  # frames per inference chunk (see process_audio_features)
    'use_half_precision': False  # float32 inference (see decode_audio_chunk)
}
|
utils/generate_face_shapes.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This software is licensed under a **dual-license model**
|
| 2 |
+
# For individuals and businesses earning **under $1M per year**, this software is licensed under the **MIT License**
|
| 3 |
+
# Businesses or organizations with **annual revenue of $1,000,000 or more** must obtain permission to use this software commercially.
|
| 4 |
+
|
| 5 |
+
# generate_face_shapes.py
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from utils.audio.extraction.extract_features import extract_audio_features
|
| 10 |
+
from utils.audio.processing.audio_processing import process_audio_features
|
| 11 |
+
|
| 12 |
+
def generate_facial_data_from_bytes(audio_bytes, model, device, config):
    """Run the full audio -> blendshape pipeline on in-memory audio bytes.

    Returns a 2-D array of per-frame blendshape coefficients, or an empty
    list when the audio is too short for feature extraction.
    """
    audio_features, y = extract_audio_features(audio_bytes, from_bytes=True)

    if audio_features is None or y is None:
        # Bug fix: this previously returned the 2-tuple `[], np.array([])`
        # while the success path returns a single value; callers such as the
        # Flask route treat the result as one object, and a bare ndarray is
        # not JSON-serializable. Return a single empty list instead.
        return []

    return process_audio_features(audio_features, model, device, config)
|
| 22 |
+
|
utils/model/__pycache__/model.cpython-311.pyc
ADDED
|
Binary file (17 kB). View file
|
|
|
utils/model/__pycache__/model.cpython-312.pyc
ADDED
|
Binary file (14.8 kB). View file
|
|
|
utils/model/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcf989405316b25c3d2cdfe389e41b50326284bd520126979eb602978ba842d3
|
| 3 |
+
size 942037770
|
utils/model/model.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This software is licensed under a **dual-license model**
|
| 2 |
+
# For individuals and businesses earning **under $1M per year**, this software is licensed under the **MIT License**
|
| 3 |
+
# Businesses or organizations with **annual revenue of $1,000,000 or more** must obtain permission to use this software commercially.
|
| 4 |
+
|
| 5 |
+
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
|
| 8 |
+
|
| 9 |
+
def load_model(model_path, config, device):
    """Build the Seq2Seq model, load weights from model_path, and return it in eval mode.

    Side effect: if half precision was requested but CUDA/cuDNN is unavailable,
    config['use_half_precision'] is overwritten with False so the rest of the
    pipeline sees the effective setting.
    """
    torch_device = torch.device(device)

    half = config.get('use_half_precision', True)
    cuda_ok = (torch_device.type == 'cuda'
               and torch.cuda.is_available()
               and torch.backends.cudnn.enabled)
    if half and not cuda_ok:
        print("⚠ Half-precision requested but CUDA or cuDNN not available. Falling back to full precision.")
        half = False
        config['use_half_precision'] = False  # keep the config in sync with the fallback

    net = Seq2Seq(
        Encoder(config['input_dim'], config['hidden_dim'], config['n_layers'], config['num_heads']),
        Decoder(config['output_dim'], config['hidden_dim'], config['n_layers'], config['num_heads']),
        torch_device,
    ).to(torch_device)

    # NOTE(review): torch.load unpickles arbitrary objects — only load trusted checkpoints.
    net.load_state_dict(torch.load(model_path, map_location=torch_device), strict=True)

    if half and torch_device.type == 'cuda':
        net = net.to(torch.float16)
        print("⚡ Model converted to float16 (half-precision).")
    else:
        print("🚫 Half-precision not applied (CPU or unsupported GPU or False set in config).")

    net.eval()
    return net
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# -------------------------------------------------------------------------------------------
|
| 48 |
+
# Seq2Seq Model
|
| 49 |
+
# -------------------------------------------------------------------------------------------
|
| 50 |
+
class Seq2Seq(nn.Module):
    """Thin wrapper chaining an encoder and a decoder.

    The decoder here consumes the encoder output directly (there is no
    separate target sequence), so forward is a simple two-stage pipeline.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src):
        # Encode, then decode the encoded representation.
        return self.decoder(self.encoder(src))
| 61 |
+
|
| 62 |
+
# -------------------------------------------------------------------------------------------
|
| 63 |
+
# Rotary Positional Embedding (RoPE) for Local Attention
|
| 64 |
+
# -------------------------------------------------------------------------------------------
|
| 65 |
+
def apply_rope_qk(q, k, use_local_positional_encoding=True):
    """Apply Rotary Positional Embedding (RoPE) to query/key tensors.

    q, k: (batch, num_heads, seq_len, head_dim); head_dim must be even.
    Returns the rotated (q, k), or the inputs unchanged when disabled.
    """
    if not use_local_positional_encoding:
        return q, k  # RoPE disabled: leave q, k untouched

    batch_size, num_heads, seq_len, head_dim = q.size()
    assert head_dim % 2 == 0, "head_dim must be even for RoPE"

    position = torch.arange(seq_len, dtype=torch.float, device=q.device).unsqueeze(1)  # (seq_len, 1)
    dim_indices = torch.arange(0, head_dim, 2, dtype=torch.float, device=q.device)  # (head_dim // 2,)
    # FIX: use a plain Python constant instead of torch.log(torch.tensor(10000.0)),
    # which allocated a scalar tensor on the CPU default device on every call.
    div_term = torch.exp(dim_indices * (-math.log(10000.0) / head_dim))

    angle = position * div_term  # (seq_len, head_dim // 2)
    # FIX: cast sin/cos to q's dtype. They were unconditionally float32 before,
    # which silently promoted half-precision q/k to float32 and broke the fp16
    # path (scaled_dot_product_attention requires q, k, v to share a dtype).
    sin = torch.sin(angle).to(q.dtype).unsqueeze(0).unsqueeze(0)  # (1, 1, seq_len, head_dim // 2)
    cos = torch.cos(angle).to(q.dtype).unsqueeze(0).unsqueeze(0)  # (1, 1, seq_len, head_dim // 2)

    def rope_transform(x):
        # Rotate each (even, odd) channel pair by the position-dependent angle.
        x1, x2 = x[..., ::2], x[..., 1::2]
        x_rope_even = x1 * cos - x2 * sin
        x_rope_odd = x1 * sin + x2 * cos
        return torch.stack([x_rope_even, x_rope_odd], dim=-1).flatten(-2)

    q = rope_transform(q)
    k = rope_transform(k)
    return q, k
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# -------------------------------------------------------------------------------------------
|
| 92 |
+
# Multi-Head Attention with RoPE
|
| 93 |
+
# -------------------------------------------------------------------------------------------
|
| 94 |
+
class MultiHeadAttention(nn.Module):
    """Multi-head attention with RoPE applied to Q/K.

    Uses torch's fused scaled_dot_product_attention when available
    (PyTorch >= 2.0) and falls back to an explicit softmax otherwise.
    """

    def __init__(self, hidden_dim, num_heads, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        assert hidden_dim % num_heads == 0, "Hidden dimension must be divisible by the number of heads"
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.scaling = self.head_dim ** -0.5

        self.q_linear = nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = nn.Linear(hidden_dim, hidden_dim)
        self.out_linear = nn.Linear(hidden_dim, hidden_dim)

        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.dropout = dropout

        # The fused kernel only exists on PyTorch >= 2.0.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: Flash Attention requires PyTorch >= 2.0")

    def _split_heads(self, x, batch_size):
        # (B, L, H*D) -> (B, H, L, D)
        return x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        q = self._split_heads(self.q_linear(query), batch_size)
        k = self._split_heads(self.k_linear(key), batch_size)
        v = self._split_heads(self.v_linear(value), batch_size)

        # Rotary positional encoding on Q/K only (if enabled inside the helper).
        q, k = apply_rope_qk(q, k)

        if self.flash:
            # SDPA applies its own 1/sqrt(d) scaling and (optional) dropout.
            attn_output = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0)
            attn_weights = None
        else:
            scores = torch.matmul(q, k.transpose(-2, -1)) * self.scaling
            if mask is not None:
                scores = scores.masked_fill(mask == 0, float('-inf'))
            attn_weights = self.attn_dropout(F.softmax(scores, dim=-1))
            attn_output = torch.matmul(attn_weights, v)

        # (B, H, L, D) -> (B, L, H*D)
        merged = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)
        return self.resid_dropout(self.out_linear(merged)), attn_weights
| 147 |
+
|
| 148 |
+
# -------------------------------------------------------------------------------------------
|
| 149 |
+
# Feed-Forward Network
|
| 150 |
+
# -------------------------------------------------------------------------------------------
|
| 151 |
+
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, hidden_dim, dim_feedforward=2048, dropout=0.0):
        super(FeedForwardNetwork, self).__init__()
        self.linear1 = nn.Linear(hidden_dim, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, hidden_dim)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)
| 164 |
+
|
| 165 |
+
# -------------------------------------------------------------------------------------------
|
| 166 |
+
# Custom Transformer Encoder/Decoder
|
| 167 |
+
# -------------------------------------------------------------------------------------------
|
| 168 |
+
class CustomTransformerEncoderLayer(nn.Module):
    """Post-norm transformer encoder layer: self-attention + FFN, each with a residual."""

    def __init__(self, hidden_dim, num_heads, dropout=0.0):
        super(CustomTransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(hidden_dim, num_heads, dropout)
        self.ffn = FeedForwardNetwork(hidden_dim, 4 * hidden_dim, dropout)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, mask=None):
        # Self-attention sublayer: residual add, then LayerNorm (post-norm).
        attn_out, _ = self.self_attn(src, src, src, mask)
        src = self.norm1(src + self.dropout1(attn_out))

        # Feed-forward sublayer: residual add, then LayerNorm.
        src = self.norm2(src + self.dropout2(self.ffn(src)))
        return src
| 187 |
+
|
| 188 |
+
class CustomTransformerDecoderLayer(nn.Module):
    """Post-norm decoder layer: self-attention, cross-attention over memory, then FFN."""

    def __init__(self, hidden_dim, num_heads, dropout=0.0):
        super(CustomTransformerDecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(hidden_dim, num_heads, dropout)
        self.multihead_attn = MultiHeadAttention(hidden_dim, num_heads, dropout)
        self.ffn = FeedForwardNetwork(hidden_dim, 4 * hidden_dim, dropout)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        # 1) Self-attention over the target sequence.
        sa_out, _ = self.self_attn(tgt, tgt, tgt, tgt_mask)
        tgt = self.norm1(tgt + self.dropout1(sa_out))

        # 2) Cross-attention: queries from tgt, keys/values from encoder memory.
        ca_out, _ = self.multihead_attn(tgt, memory, memory, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(ca_out))

        # 3) Position-wise feed-forward.
        tgt = self.norm3(tgt + self.dropout3(self.ffn(tgt)))
        return tgt
| 214 |
+
|
| 215 |
+
# -------------------------------------------------------------------------------------------
|
| 216 |
+
# Encoder
|
| 217 |
+
# -------------------------------------------------------------------------------------------
|
| 218 |
+
class Encoder(nn.Module):
    """Stack of encoder layers over a linear input embedding.

    No global positional encoding is added here; RoPE inside the attention
    layers supplies the position information.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, num_heads, dropout=0.0, use_norm=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        self.transformer_encoder = nn.ModuleList(
            CustomTransformerEncoderLayer(hidden_dim, num_heads, dropout)
            for _ in range(n_layers)
        )
        self.layer_norm = nn.LayerNorm(hidden_dim) if use_norm else None

    def forward(self, x):
        x = self.embedding(x)
        for encoder_layer in self.transformer_encoder:
            x = encoder_layer(x)
        # Optional final normalization.
        return self.layer_norm(x) if self.layer_norm else x
| 236 |
+
|
| 237 |
+
# -------------------------------------------------------------------------------------------
|
| 238 |
+
# Decoder
|
| 239 |
+
# -------------------------------------------------------------------------------------------
|
| 240 |
+
class Decoder(nn.Module):
    """Stack of decoder layers driven entirely by the encoder output.

    forward takes only encoder_outputs: the same tensor serves as both the
    decoder input and the cross-attention memory in every layer.
    """

    def __init__(self, output_dim, hidden_dim, n_layers, num_heads, dropout=0.0, use_norm=True):
        super(Decoder, self).__init__()
        self.transformer_decoder = nn.ModuleList(
            CustomTransformerDecoderLayer(hidden_dim, num_heads, dropout)
            for _ in range(n_layers)
        )
        self.fc_output = nn.Linear(hidden_dim, output_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim) if use_norm else None

    def forward(self, encoder_outputs):
        x = encoder_outputs
        for decoder_layer in self.transformer_decoder:
            x = decoder_layer(x, encoder_outputs)
        if self.layer_norm:
            x = self.layer_norm(x)
        # Project to the output (blendshape) dimension.
        return self.fc_output(x)
| 256 |
+
|