2
0

Cloning from Github

This commit is contained in:
2025-10-20 18:58:25 +02:00
commit 9643e6494d
50 changed files with 9257 additions and 0 deletions

View File

@@ -0,0 +1,483 @@
"""
Processor class for VibeVoice models.
"""
import os
import json
import warnings
from typing import List, Optional, Union, Dict, Any
import numpy as np
import torch
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.utils import logging
logger = logging.get_logger(__name__)
class AudioNormalizer:
    """
    Audio normalization class for VibeVoice tokenizer.

    Calling an instance rescales a signal so that its RMS level matches
    ``target_dB_FS`` and then, if the rescaled signal peaks above 1.0,
    scales it back down so that it no longer clips.
    """

    def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6):
        """
        Initialize the audio normalizer.

        Args:
            target_dB_FS (float): Target dB FS level for the audio. Default: -25
            eps (float): Small value to avoid division by zero. Default: 1e-6
        """
        self.target_dB_FS = target_dB_FS
        self.eps = eps

    @staticmethod
    def _as_float_array(audio) -> np.ndarray:
        """Return ``audio`` as a floating-point array (float input is returned as-is)."""
        samples = np.asarray(audio)
        if not np.issubdtype(samples.dtype, np.floating):
            # Integer PCM would overflow when squared (int16 ** 2) and
            # np.abs() wraps on the most negative value, so promote first.
            samples = samples.astype(np.float64)
        return samples

    def tailor_dB_FS(self, audio: np.ndarray) -> tuple:
        """
        Adjust the audio to the target dB FS level.

        Args:
            audio (np.ndarray): Input audio signal. Non-floating input is
                promoted to float64 before the level is measured.

        Returns:
            tuple: (normalized_audio, rms, scalar) where ``rms`` is the level
            measured on the input and ``scalar`` the gain that was applied.
        """
        samples = self._as_float_array(audio)
        if samples.size:
            rms = np.sqrt(np.mean(samples ** 2))
        else:
            # An empty signal has no level; avoid the NaN that np.mean
            # would produce while keeping a NumPy scalar for the division.
            rms = samples.dtype.type(0)
        scalar = 10 ** (self.target_dB_FS / 20) / (rms + self.eps)
        return samples * scalar, rms, scalar

    def avoid_clipping(self, audio: np.ndarray, scalar: Optional[float] = None) -> tuple:
        """
        Avoid clipping by scaling down if necessary.

        Args:
            audio (np.ndarray): Input audio signal
            scalar (float, optional): Explicit divisor. When omitted, the
                divisor is ``peak + eps`` if the absolute peak exceeds 1.0
                and ``1.0`` otherwise.

        Returns:
            tuple: (normalized_audio, scalar)
        """
        samples = self._as_float_array(audio)
        if scalar is None:
            # np.max raises on zero-size input, so treat empty audio as silent.
            peak = np.max(np.abs(samples)) if samples.size else 0.0
            scalar = peak + self.eps if peak > 1.0 else 1.0
        return samples / scalar, scalar

    def __call__(self, audio: np.ndarray) -> np.ndarray:
        """
        Normalize the audio by adjusting to target dB FS and avoiding clipping.

        Args:
            audio (np.ndarray): Input audio signal

        Returns:
            np.ndarray: Normalized audio signal
        """
        # Level first, then protect against peaks the gain may have introduced.
        leveled, _, _ = self.tailor_dB_FS(audio)
        safe, _ = self.avoid_clipping(leveled)
        return safe
# Subclasses FeatureExtractionMixin rather than ProcessorMixin, because FeatureExtractionMixin is designed for single components.
class VibeVoiceTokenizerProcessor(FeatureExtractionMixin):
    """
    Processor for VibeVoice acoustic tokenizer models.

    The processor turns raw audio (arrays, lists of floats or file paths)
    into mono float32 arrays, optionally normalizes their level with
    ``AudioNormalizer``, and can return the result as a PyTorch or NumPy
    batch. It also provides a helper for writing audio back to WAV files.

    Args:
        sampling_rate (int, optional): Expected sampling rate. Defaults to 24000.
        normalize_audio (bool, optional): Whether to normalize audio. Defaults to True.
        target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25.
        eps (float, optional): Small value for numerical stability. Defaults to 1e-6.
    """

    # NOTE(review): __call__ returns its result under the key "audio", not
    # "input_features"; confirm which name downstream consumers rely on.
    model_input_names = ["input_features"]

    def __init__(
        self,
        sampling_rate: int = 24000,
        normalize_audio: bool = True,
        target_dB_FS: float = -25,
        eps: float = 1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.sampling_rate = sampling_rate
        self.normalize_audio = normalize_audio

        # The normalizer only exists when normalization is enabled by default.
        if self.normalize_audio:
            self.normalizer = AudioNormalizer(target_dB_FS=target_dB_FS, eps=eps)
        else:
            self.normalizer = None

        # Constructor arguments, kept so that to_dict() can serialize them.
        self.feature_extractor_dict = {
            "sampling_rate": sampling_rate,
            "normalize_audio": normalize_audio,
            "target_dB_FS": target_dB_FS,
            "eps": eps,
        }

    def _ensure_mono(self, audio: np.ndarray) -> np.ndarray:
        """
        Convert stereo audio to mono if needed.

        A 2-D array with a dimension of size 2 is averaged over that
        dimension (the first dimension is checked first); a 2-D array with a
        dimension of size 1 has that dimension squeezed away.

        Args:
            audio (np.ndarray): Input audio array

        Returns:
            np.ndarray: Mono (1-D) audio array

        Raises:
            ValueError: If the array is not 1-D or 2-D, or is 2-D without a
                dimension of size 1 or 2.
        """
        ndim = len(audio.shape)
        if ndim == 1:
            return audio
        if ndim != 2:
            raise ValueError(f"Audio should be 1D or 2D, got shape: {audio.shape}")

        rows, cols = audio.shape
        if rows == 2:  # (2, time)
            return np.mean(audio, axis=0)
        if cols == 2:  # (time, 2)
            return np.mean(audio, axis=1)
        if rows == 1:
            return audio.squeeze(0)
        if cols == 1:
            return audio.squeeze(1)
        raise ValueError(f"Unexpected audio shape: {audio.shape}")

    def _process_single_audio(
        self,
        audio: Union[np.ndarray, List[float]],
        normalize: Optional[bool] = None,
    ) -> np.ndarray:
        """
        Process a single audio array.

        Args:
            audio: Single audio input
            normalize: Per-call override of the normalization setting.
                ``None`` (default) uses the processor's own setting.

        Returns:
            np.ndarray: Mono float32 audio, normalized when requested
        """
        if isinstance(audio, np.ndarray):
            audio = audio.astype(np.float32)
        else:
            audio = np.array(audio, dtype=np.float32)

        audio = self._ensure_mono(audio)

        if normalize is None:
            # Default path: normalize only if enabled and a normalizer exists.
            normalizer = self.normalizer if self.normalize_audio else None
        elif normalize:
            normalizer = self.normalizer
            if normalizer is None:
                # The processor was built with normalize_audio=False, so
                # build a normalizer from the stored configuration to honor
                # the explicit request instead of silently ignoring it.
                normalizer = AudioNormalizer(
                    target_dB_FS=self.feature_extractor_dict["target_dB_FS"],
                    eps=self.feature_extractor_dict["eps"],
                )
        else:
            normalizer = None

        if normalizer is not None:
            audio = normalizer(audio)
        return audio

    def __call__(
        self,
        audio: Optional[Union[str, np.ndarray, List[float], List[np.ndarray], List[List[float]], List[str]]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = None,
        **kwargs,
    ):
        """
        Process audio for VibeVoice models.

        Args:
            audio: Audio input(s) to process. Can be:
                - str: Path to audio file
                - np.ndarray: Audio array
                - List[float]: Audio as list of floats
                - List[np.ndarray] / List[List[float]]: Batch of audio arrays
                - List[str]: Batch of audio file paths
            sampling_rate (int, optional): Sampling rate of the input audio.
                It is only compared with the expected rate; a mismatch logs a
                warning and the audio is not resampled here.
            return_tensors (str, optional): Return format ('pt' for PyTorch,
                'np' for NumPy). Any other value returns plain NumPy arrays.

        Returns:
            dict: A dict with the single key ``"audio"``:
                - with 'pt' or 'np': one array/tensor of shape
                  (batch, 1, time); no padding is performed, so all items
                  must have the same length to be stacked
                - otherwise: the processed array for a single item, or a list
                  of arrays for several items

        Raises:
            ValueError: If ``audio`` is None or an empty list.
        """
        if audio is None:
            raise ValueError("Audio input is required")

        if sampling_rate is not None and sampling_rate != self.sampling_rate:
            logger.warning(
                f"Input sampling rate ({sampling_rate}) differs from expected "
                f"sampling rate ({self.sampling_rate}). Please resample your audio."
            )

        # Normalize every accepted input form to a list of single items.
        if isinstance(audio, str):
            batch = [self._load_audio_from_path(audio)]
        elif isinstance(audio, list):
            if len(audio) == 0:
                raise ValueError("Empty audio list provided")
            if all(isinstance(item, str) for item in audio):
                batch = [self._load_audio_from_path(path) for path in audio]
            elif isinstance(audio[0], (np.ndarray, list)):
                # List of arrays / nested lists: already a batch.
                batch = audio
            else:
                # Flat list of numbers: one audio signal.
                batch = [audio]
        else:
            batch = [audio]

        processed_audio = [self._process_single_audio(item) for item in batch]

        if return_tensors == "pt":
            # (B, T) -> (B, 1, T)
            input_features = torch.stack(
                [torch.from_numpy(a) for a in processed_audio]
            ).unsqueeze(1)
        elif return_tensors == "np":
            # (B, T) -> (B, 1, T)
            input_features = np.stack(processed_audio)[:, np.newaxis, :]
        else:
            input_features = processed_audio[0] if len(processed_audio) == 1 else processed_audio

        return {
            "audio": input_features,  # Use "audio" instead of "input_features"
        }

    def _load_audio_from_path(self, audio_path: str) -> np.ndarray:
        """
        Load audio from file path.

        The loader is chosen from the file extension: common audio formats
        go through librosa (requesting the processor's sampling rate and
        mono output), ``.pt`` through ``torch.load`` and ``.npy`` through
        ``np.load``.

        Args:
            audio_path (str): Path to audio file

        Returns:
            np.ndarray: Loaded audio array

        Raises:
            ValueError: If the file extension is not supported.
        """
        file_ext = os.path.splitext(audio_path)[1].lower()

        if file_ext in ('.wav', '.mp3', '.flac', '.m4a', '.ogg'):
            # Imported lazily so librosa is only needed for audio files.
            import librosa
            audio_array, _ = librosa.load(
                audio_path,
                sr=self.sampling_rate,
                mono=True
            )
            return audio_array

        if file_ext == '.pt':
            # NOTE(security): torch.load unpickles the file; only load .pt
            # files that come from a trusted source.
            loaded = torch.load(audio_path, map_location='cpu')
            # Check the type before touching tensor-only methods, so that
            # non-tensor payloads (arrays, lists) are handled as well.
            if isinstance(loaded, torch.Tensor):
                audio_array = loaded.detach().float().squeeze().numpy()
            else:
                audio_array = np.squeeze(np.array(loaded))
            return audio_array.astype(np.float32)

        if file_ext == '.npy':
            audio_array = np.load(audio_path)
            return audio_array.astype(np.float32)

        raise ValueError(
            f"Unsupported file format: {file_ext}. "
            f"Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy"
        )

    def preprocess_audio(
        self,
        audio_path_or_array: Union[str, np.ndarray],
        normalize: Optional[bool] = None,
    ) -> np.ndarray:
        """
        Convenience method to preprocess audio from file path or array.
        This method is kept for backward compatibility but __call__ is recommended.

        Args:
            audio_path_or_array: Path to audio file or numpy array
            normalize: Whether to normalize (overrides default setting)

        Returns:
            np.ndarray: Preprocessed audio array
        """
        if isinstance(audio_path_or_array, str):
            audio_array = self._load_audio_from_path(audio_path_or_array)
        else:
            audio_array = np.array(audio_path_or_array, dtype=np.float32)

        # The override is passed down explicitly rather than by temporarily
        # flipping self.normalize_audio, so instance state is never mutated.
        return self._process_single_audio(audio_array, normalize=normalize)

    # Override to_dict method for configuration saving
    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the object to a dict containing all attributes needed for serialization.

        A copy is returned so that callers which modify the result do not
        alter the processor's stored configuration.
        """
        return dict(self.feature_extractor_dict)

    @staticmethod
    def _to_numpy_audio(item) -> np.ndarray:
        """Convert one tensor / array / sequence to a NumPy array."""
        if isinstance(item, torch.Tensor):
            return item.float().detach().cpu().numpy()
        return np.asarray(item)

    def save_audio(
        self,
        audio: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]],
        output_path: str = "output.wav",
        sampling_rate: Optional[int] = None,
        normalize: bool = False,
        batch_prefix: str = "audio_",
    ):
        """
        Save audio data to WAV file(s).

        Args:
            audio: Audio data to save. Can be:
                - torch.Tensor or np.ndarray with three or more dimensions,
                  e.g. (B, C, T): written as one file per batch entry when
                  B > 1, otherwise squeezed and written as a single file
                - torch.Tensor or np.ndarray with fewer dimensions: written
                  as a single file (a leading dimension of size 1 is
                  dropped; a 2-D input is not split by batch)
                - List of tensors or arrays: one file per list element
            output_path: Path where to save the audio. If saving multiple files,
                this is treated as a directory and individual files will be saved inside.
            sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate.
            normalize: Whether to peak-normalize audio before saving.
            batch_prefix: Prefix for batch files when saving multiple audios.

        Returns:
            List[str]: Paths to the saved audio files.

        Raises:
            ImportError: If ``soundfile`` is not installed.
            ValueError: If ``audio`` is not a tensor, array or list.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        try:
            import soundfile as sf
        except ImportError as err:
            raise ImportError(
                "soundfile is required to save audio files. "
                "Install it with: pip install soundfile"
            ) from err

        # Reduce every accepted input to a list of arrays plus a flag that
        # says whether output_path is a directory or a single file.
        if isinstance(audio, list):
            # Convert element by element so mixed tensor/array lists work.
            items = [self._to_numpy_audio(item) for item in audio]
            as_directory = True
        elif isinstance(audio, (torch.Tensor, np.ndarray)):
            audio_np = self._to_numpy_audio(audio)
            if len(audio_np.shape) >= 3 and audio_np.shape[0] > 1:
                items = []
                for single_audio in audio_np:
                    # Drop a singleton channel dimension: (1, T) -> (T,)
                    if len(single_audio.shape) > 1 and single_audio.shape[0] == 1:
                        single_audio = single_audio.squeeze(0)
                    items.append(single_audio)
                as_directory = True
            else:
                if len(audio_np.shape) >= 3:
                    # Single audio with batch and channel dims
                    audio_np = audio_np.squeeze()
                items = [audio_np]
                as_directory = False
        else:
            raise ValueError(f"Unsupported audio type: {type(audio)}")

        saved_paths = []
        if as_directory:
            os.makedirs(output_path, exist_ok=True)
            for index, item in enumerate(items):
                data = self._prepare_audio_for_save(item, normalize)
                file_path = os.path.join(output_path, f"{batch_prefix}{index}.wav")
                sf.write(file_path, data, sampling_rate)
                saved_paths.append(file_path)
        else:
            # Create the parent directory, as the batch branch does for its
            # output directory.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            data = self._prepare_audio_for_save(items[0], normalize)
            sf.write(output_path, data, sampling_rate)
            saved_paths.append(output_path)

        return saved_paths

    def _prepare_audio_for_save(self, audio: np.ndarray, normalize: bool) -> np.ndarray:
        """
        Prepare audio for saving by ensuring it's the right shape and optionally normalizing.

        Args:
            audio: Audio data as numpy array
            normalize: Whether to scale the audio so its absolute peak is 1

        Returns:
            np.ndarray: Processed audio ready for saving
        """
        # (1, T) -> (T,)
        if len(audio.shape) > 1 and audio.shape[0] == 1:
            audio = audio.squeeze(0)

        if normalize:
            max_val = np.abs(audio).max()
            # All-zero audio is left untouched to avoid dividing by zero.
            if max_val > 0:
                audio = audio / max_val
        return audio
# Names exported by a star-import of this module.
__all__ = ["VibeVoiceTokenizerProcessor", "AudioNormalizer"]