Cloning from Github
vibevoice/processor/vibevoice_tokenizer_processor.py (new file, 483 lines added)
@@ -0,0 +1,483 @@
"""
Processor class for VibeVoice models.
"""

import os
import json
import warnings
from typing import List, Optional, Union, Dict, Any

import numpy as np
import torch

from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.utils import logging

logger = logging.get_logger(__name__)


class AudioNormalizer:
    """
    Audio normalization class for the VibeVoice tokenizer.

    This class provides audio normalization to ensure consistent input levels
    for the VibeVoice tokenizer while maintaining audio quality.
    """

    def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6):
        """
        Initialize the audio normalizer.

        Args:
            target_dB_FS (float): Target dB FS level for the audio. Default: -25
            eps (float): Small value to avoid division by zero. Default: 1e-6
        """
        self.target_dB_FS = target_dB_FS
        self.eps = eps

    def tailor_dB_FS(self, audio: np.ndarray) -> tuple:
        """
        Adjust the audio to the target dB FS level.

        Args:
            audio (np.ndarray): Input audio signal

        Returns:
            tuple: (normalized_audio, rms, scalar)
        """
        rms = np.sqrt(np.mean(audio**2))
        # Convert the target dB FS level into a linear gain relative to the current RMS
        scalar = 10 ** (self.target_dB_FS / 20) / (rms + self.eps)
        normalized_audio = audio * scalar
        return normalized_audio, rms, scalar

    def avoid_clipping(self, audio: np.ndarray, scalar: Optional[float] = None) -> tuple:
        """
        Avoid clipping by scaling down if necessary.

        Args:
            audio (np.ndarray): Input audio signal
            scalar (float, optional): Explicit scaling factor

        Returns:
            tuple: (normalized_audio, scalar)
        """
        if scalar is None:
            max_val = np.max(np.abs(audio))
            if max_val > 1.0:
                scalar = max_val + self.eps
            else:
                scalar = 1.0

        return audio / scalar, scalar

    def __call__(self, audio: np.ndarray) -> np.ndarray:
        """
        Normalize the audio by adjusting to target dB FS and avoiding clipping.

        Args:
            audio (np.ndarray): Input audio signal

        Returns:
            np.ndarray: Normalized audio signal
        """
        # First adjust to target dB FS
        audio, _, _ = self.tailor_dB_FS(audio)
        # Then avoid clipping
        audio, _ = self.avoid_clipping(audio)
        return audio

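
# Usage sketch for AudioNormalizer (illustrative assumption, not part of the
# original commit): level-normalize one second of quiet synthetic audio.
#
#     normalizer = AudioNormalizer(target_dB_FS=-25)
#     audio = 0.01 * np.random.randn(24000).astype(np.float32)
#     normalized = normalizer(audio)
#     rms = np.sqrt(np.mean(normalized ** 2))  # ~10 ** (-25 / 20), i.e. ~0.056, if no clipping occurred
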
# Subclass FeatureExtractionMixin rather than ProcessorMixin, since
# FeatureExtractionMixin is designed for single components.
class VibeVoiceTokenizerProcessor(FeatureExtractionMixin):
    """
    Processor for VibeVoice acoustic tokenizer models.

    This processor handles audio preprocessing for VibeVoice models, including:
    - Audio format conversion (stereo to mono)
    - Optional audio normalization
    - Streaming support for infinite-length audio

    Args:
        sampling_rate (int, optional): Expected sampling rate. Defaults to 24000.
        normalize_audio (bool, optional): Whether to normalize audio. Defaults to True.
        target_dB_FS (float, optional): Target dB FS for normalization. Defaults to -25.
        eps (float, optional): Small value for numerical stability. Defaults to 1e-6.
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        sampling_rate: int = 24000,
        normalize_audio: bool = True,
        target_dB_FS: float = -25,
        eps: float = 1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.sampling_rate = sampling_rate
        self.normalize_audio = normalize_audio

        # Initialize audio normalizer if needed
        if self.normalize_audio:
            self.normalizer = AudioNormalizer(target_dB_FS=target_dB_FS, eps=eps)
        else:
            self.normalizer = None

        # Save config for serialization via to_dict()
        self.feature_extractor_dict = {
            "sampling_rate": sampling_rate,
            "normalize_audio": normalize_audio,
            "target_dB_FS": target_dB_FS,
            "eps": eps,
        }

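    # Construction sketch (illustrative assumption, not part of the original
    # commit): build a processor and inspect its serialized configuration.
    #
    #     processor = VibeVoiceTokenizerProcessor(sampling_rate=24000, normalize_audio=True)
    #     processor.to_dict()
    #     # -> {"sampling_rate": 24000, "normalize_audio": True, "target_dB_FS": -25, "eps": 1e-06}
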
    def _ensure_mono(self, audio: np.ndarray) -> np.ndarray:
        """
        Convert stereo audio to mono if needed.

        Args:
            audio (np.ndarray): Input audio array

        Returns:
            np.ndarray: Mono audio array
        """
        if len(audio.shape) == 1:
            return audio
        elif len(audio.shape) == 2:
            if audio.shape[0] == 2:  # (2, time)
                return np.mean(audio, axis=0)
            elif audio.shape[1] == 2:  # (time, 2)
                return np.mean(audio, axis=1)
            else:
                # If one dimension is 1, squeeze it
                if audio.shape[0] == 1:
                    return audio.squeeze(0)
                elif audio.shape[1] == 1:
                    return audio.squeeze(1)
                else:
                    raise ValueError(f"Unexpected audio shape: {audio.shape}")
        else:
            raise ValueError(f"Audio should be 1D or 2D, got shape: {audio.shape}")

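    # Shape-handling sketch for _ensure_mono (illustrative assumption, not part
    # of the original commit):
    #
    #     _ensure_mono(np.zeros(24000))         # (T,)   -> unchanged
    #     _ensure_mono(np.zeros((2, 24000)))    # (2, T) -> mean over channels, (T,)
    #     _ensure_mono(np.zeros((24000, 2)))    # (T, 2) -> mean over channels, (T,)
    #     _ensure_mono(np.zeros((1, 24000)))    # (1, T) -> squeezed to (T,)
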
    def _process_single_audio(self, audio: Union[np.ndarray, List[float]]) -> np.ndarray:
        """
        Process a single audio array.

        Args:
            audio: Single audio input

        Returns:
            np.ndarray: Processed audio
        """
        # Convert to numpy array
        if not isinstance(audio, np.ndarray):
            audio = np.array(audio, dtype=np.float32)
        else:
            audio = audio.astype(np.float32)

        # Ensure mono
        audio = self._ensure_mono(audio)

        # Normalize if requested
        if self.normalize_audio and self.normalizer is not None:
            audio = self.normalizer(audio)

        return audio

    def __call__(
        self,
        audio: Union[str, np.ndarray, List[float], List[np.ndarray], List[List[float]], List[str]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = None,
        **kwargs,
    ):
        """
        Process audio for VibeVoice models.

        Args:
            audio: Audio input(s) to process. Can be:
                - str: Path to an audio file
                - np.ndarray: Audio array
                - List[float]: Audio as a list of floats
                - List[np.ndarray]: Batch of audio arrays
                - List[str]: Batch of audio file paths
            sampling_rate (int, optional): Sampling rate of the input audio
            return_tensors (str, optional): Return format ('pt' for PyTorch, 'np' for NumPy)

        Returns:
            dict: Processed audio inputs with key:
                - audio: Audio tensor(s) ready for the model
        """
        if audio is None:
            raise ValueError("Audio input is required")

        # Validate sampling rate
        if sampling_rate is not None and sampling_rate != self.sampling_rate:
            logger.warning(
                f"Input sampling rate ({sampling_rate}) differs from expected "
                f"sampling rate ({self.sampling_rate}). Please resample your audio."
            )

        # Handle different input types
        if isinstance(audio, str):
            # Single audio file path
            audio = self._load_audio_from_path(audio)
            is_batched = False
        elif isinstance(audio, list):
            if len(audio) == 0:
                raise ValueError("Empty audio list provided")

            # Check if it's a list of file paths
            if all(isinstance(item, str) for item in audio):
                # Batch of audio file paths
                audio = [self._load_audio_from_path(path) for path in audio]
                is_batched = True
            else:
                # Check if it's batched audio arrays
                is_batched = isinstance(audio[0], (np.ndarray, list))
        else:
            # Single audio array
            is_batched = False

        # Process audio
        if is_batched:
            processed_audio = [self._process_single_audio(a) for a in audio]
        else:
            processed_audio = [self._process_single_audio(audio)]

        # Convert to tensors if requested
        if return_tensors == "pt":
            if len(processed_audio) == 1:
                # Add batch and channel dimensions: (1, 1, T)
                input_features = torch.from_numpy(processed_audio[0]).unsqueeze(0).unsqueeze(1)
            else:
                # Stack into (B, 1, T); all audios must share the same length,
                # since no padding is applied here
                input_features = torch.stack([torch.from_numpy(a) for a in processed_audio]).unsqueeze(1)
        elif return_tensors == "np":
            if len(processed_audio) == 1:
                input_features = processed_audio[0][np.newaxis, np.newaxis, :]
            else:
                input_features = np.stack(processed_audio)[:, np.newaxis, :]
        else:
            input_features = processed_audio[0] if len(processed_audio) == 1 else processed_audio

        outputs = {
            # Expose the result under "audio" rather than "input_features"
            "audio": input_features,
        }

        return outputs

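    # Call sketch (illustrative assumption, not part of the original commit):
    #
    #     processor = VibeVoiceTokenizerProcessor()
    #     out = processor(audio=np.zeros(24000, dtype=np.float32), return_tensors="pt")
    #     out["audio"].shape
    #     # -> torch.Size([1, 1, 24000])
    #
    #     # Batched file paths (all clips must have equal length for "pt"/"np"):
    #     out = processor(audio=["a.wav", "b.wav"], return_tensors="np")
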
    def _load_audio_from_path(self, audio_path: str) -> np.ndarray:
        """
        Load audio from a file path.

        Args:
            audio_path (str): Path to audio file

        Returns:
            np.ndarray: Loaded audio array
        """
        # Get file extension to determine loading method
        file_ext = os.path.splitext(audio_path)[1].lower()

        if file_ext in ['.wav', '.mp3', '.flac', '.m4a', '.ogg']:
            # Audio file - use librosa
            import librosa
            audio_array, sr = librosa.load(
                audio_path,
                sr=self.sampling_rate,
                mono=True
            )
            return audio_array
        elif file_ext == '.pt':
            # PyTorch tensor file
            audio_tensor = torch.load(audio_path, map_location='cpu').squeeze()
            if isinstance(audio_tensor, torch.Tensor):
                audio_array = audio_tensor.numpy()
            else:
                audio_array = np.array(audio_tensor)
            return audio_array.astype(np.float32)
        elif file_ext == '.npy':
            # NumPy file
            audio_array = np.load(audio_path)
            return audio_array.astype(np.float32)
        else:
            raise ValueError(
                f"Unsupported file format: {file_ext}. "
                f"Supported formats: .wav, .mp3, .flac, .m4a, .ogg, .pt, .npy"
            )

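    # Loading sketch (illustrative assumption, not part of the original commit):
    # librosa resamples compressed/PCM audio to self.sampling_rate, while .pt
    # and .npy payloads are taken as-is at whatever rate they were saved with.
    #
    #     arr = processor._load_audio_from_path("speech.wav")  # resampled to 24 kHz
    #     arr = processor._load_audio_from_path("speech.npy")  # raw array, no resampling
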
    def preprocess_audio(
        self,
        audio_path_or_array: Union[str, np.ndarray],
        normalize: Optional[bool] = None,
    ) -> np.ndarray:
        """
        Convenience method to preprocess audio from a file path or array.
        This method is kept for backward compatibility, but __call__ is recommended.

        Args:
            audio_path_or_array: Path to audio file or numpy array
            normalize: Whether to normalize (overrides the default setting)

        Returns:
            np.ndarray: Preprocessed audio array
        """
        if isinstance(audio_path_or_array, str):
            audio_array = self._load_audio_from_path(audio_path_or_array)
        else:
            audio_array = np.array(audio_path_or_array, dtype=np.float32)

        # Temporarily override the normalization setting if specified
        original_normalize = self.normalize_audio
        if normalize is not None:
            self.normalize_audio = normalize

        try:
            processed = self._process_single_audio(audio_array)
        finally:
            # Restore the original setting
            self.normalize_audio = original_normalize

        return processed

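    # Override sketch (illustrative assumption, not part of the original commit):
    # preprocess_audio can bypass the processor-level normalization per call.
    #
    #     raw = processor.preprocess_audio("speech.wav", normalize=False)
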
    # Override to_dict for configuration saving
    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the object to a dict containing all attributes needed for serialization.
        """
        return self.feature_extractor_dict

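    # Serialization sketch (illustrative assumption, not part of the original
    # commit): FeatureExtractionMixin.save_pretrained() writes to_dict() out as
    # preprocessor_config.json, so the processor config can round-trip:
    #
    #     processor.save_pretrained("./vibevoice_processor")
    #     processor = VibeVoiceTokenizerProcessor.from_pretrained("./vibevoice_processor")
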
    def save_audio(
        self,
        audio: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]],
        output_path: str = "output.wav",
        sampling_rate: Optional[int] = None,
        normalize: bool = False,
        batch_prefix: str = "audio_",
    ):
        """
        Save audio data to WAV file(s).

        Args:
            audio: Audio data to save. Can be:
                - torch.Tensor: PyTorch tensor with shape (B, C, T), (B, T), or (T,)
                - np.ndarray: NumPy array with shape (B, C, T), (B, T), or (T,)
                - List of tensors or arrays
            output_path: Where to save the audio. When saving multiple files,
                this is treated as a directory and individual files are saved inside it.
            sampling_rate: Sampling rate for the saved audio. Defaults to the processor's rate.
            normalize: Whether to normalize audio before saving.
            batch_prefix: Prefix for batch files when saving multiple audios.

        Returns:
            List[str]: Paths to the saved audio files.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        try:
            import soundfile as sf
        except ImportError:
            raise ImportError(
                "soundfile is required to save audio files. "
                "Install it with: pip install soundfile"
            )

        # Ensure audio is in the right format
        if isinstance(audio, torch.Tensor):
            # Convert PyTorch tensor to numpy
            audio_np = audio.float().detach().cpu().numpy()
        elif isinstance(audio, np.ndarray):
            audio_np = audio
        elif isinstance(audio, list):
            # Handle a list of tensors or arrays
            if all(isinstance(a, torch.Tensor) for a in audio):
                audio_np = [a.float().detach().cpu().numpy() for a in audio]
            else:
                audio_np = audio
        else:
            raise ValueError(f"Unsupported audio type: {type(audio)}")

        saved_paths = []

        # Handle based on shape or type
        if isinstance(audio_np, list):
            # Multiple separate audios to save
            output_dir = output_path

            # Ensure the output directory exists
            os.makedirs(output_dir, exist_ok=True)

            # Save each audio
            for i, audio_item in enumerate(audio_np):
                audio_item = self._prepare_audio_for_save(audio_item, normalize)
                file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav")
                sf.write(file_path, audio_item, sampling_rate)
                saved_paths.append(file_path)

        else:
            # Handle different dimensions
            if len(audio_np.shape) >= 3:  # (B, C, T) or similar
                # Get batch size
                batch_size = audio_np.shape[0]

                if batch_size > 1:
                    # Multiple audios in a batch
                    output_dir = output_path

                    # Ensure the output directory exists
                    os.makedirs(output_dir, exist_ok=True)

                    # Save each audio in the batch
                    for i in range(batch_size):
                        # Extract single audio and remove the channel dim if present
                        single_audio = audio_np[i]
                        if len(single_audio.shape) > 1:
                            if single_audio.shape[0] == 1:  # (1, T)
                                single_audio = single_audio.squeeze(0)

                        single_audio = self._prepare_audio_for_save(single_audio, normalize)
                        file_path = os.path.join(output_dir, f"{batch_prefix}{i}.wav")
                        sf.write(file_path, single_audio, sampling_rate)
                        saved_paths.append(file_path)
                else:
                    # Single audio with batch and channel dims
                    audio_item = audio_np.squeeze()  # Remove batch and channel dimensions
                    audio_item = self._prepare_audio_for_save(audio_item, normalize)
                    sf.write(output_path, audio_item, sampling_rate)
                    saved_paths.append(output_path)
            else:
                # Single audio without a batch dimension
                audio_item = self._prepare_audio_for_save(audio_np, normalize)
                sf.write(output_path, audio_item, sampling_rate)
                saved_paths.append(output_path)

        return saved_paths

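    # Saving sketch (illustrative assumption, not part of the original commit):
    #
    #     # Single clip -> one file at the given path
    #     processor.save_audio(np.zeros(24000, dtype=np.float32), "out.wav")
    #
    #     # Batched tensor (B, 1, T) with B > 1 -> output_path is used as a directory
    #     processor.save_audio(torch.zeros(2, 1, 24000), "out_dir")  # out_dir/audio_0.wav, ...
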
    def _prepare_audio_for_save(self, audio: np.ndarray, normalize: bool) -> np.ndarray:
        """
        Prepare audio for saving by ensuring it has the right shape and optionally normalizing.

        Args:
            audio: Audio data as numpy array
            normalize: Whether to normalize audio

        Returns:
            np.ndarray: Processed audio ready for saving
        """
        # Ensure the right dimensionality
        if len(audio.shape) > 1 and audio.shape[0] == 1:  # (1, T)
            audio = audio.squeeze(0)

        # Peak-normalize into [-1, 1] if requested
        if normalize:
            max_val = np.abs(audio).max()
            if max_val > 0:
                audio = audio / max_val

        return audio


__all__ = ["VibeVoiceTokenizerProcessor", "AudioNormalizer"]
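

# Minimal end-to-end demo (illustrative assumption, not part of the original
# commit); requires soundfile for the save step.
if __name__ == "__main__":
    processor = VibeVoiceTokenizerProcessor(sampling_rate=24000, normalize_audio=True)

    # One second of a 440 Hz tone at 24 kHz
    t = np.linspace(0, 1, 24000, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

    out = processor(audio=tone, return_tensors="pt")
    print(out["audio"].shape)  # torch.Size([1, 1, 24000])

    processor.save_audio(out["audio"], output_path="demo_tone.wav")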