Cloning from Github
This commit is contained in:
248
vibevoice/modular/configuration_vibevoice.py
Normal file
248
vibevoice/modular/configuration_vibevoice.py
Normal file
@@ -0,0 +1,248 @@
|
||||
""" VibeVoice_AcousticTokenizer model configuration"""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class VibeVoiceAcousticTokenizerConfig(PretrainedConfig):
    """Configuration holder for the VibeVoice acoustic tokenizer.

    Every constructor argument is stored on the instance under the same
    name. The two ratio lists are stored as copies so that instances never
    share (or alias) the list object used as the signature default.

    Args:
        channels, corpus_normalize, causal, vae_dim, fix_std, std_dist_type:
            General tokenizer settings, stored as given.
        mixer_layer, conv_norm, pad_mode, disable_last_norm, layernorm,
        layernorm_eps, layernorm_elementwise_affine, conv_bias,
        layer_scale_init_value, weight_init_value:
            Settings grouped as "common" in the signature, stored as given.
        encoder_n_filters, encoder_ratios, encoder_depths:
            Encoder-specific settings. ``encoder_depths`` is a dash-separated
            string (default ``"3-3-3-3-3-3-8"``).
        decoder_n_filters, decoder_ratios, decoder_depths:
            Decoder-specific settings. When ``decoder_ratios`` is ``None`` a
            copy of ``encoder_ratios`` is used instead.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_acoustic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0.5,
        std_dist_type: str = 'gaussian',
        # common
        mixer_layer: str = 'depthwise_conv',
        conv_norm: str = 'none',
        pad_mode: str = 'constant',
        disable_last_norm: bool = True,
        layernorm: str = 'RMSNorm',
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2],
        encoder_depths: str = "3-3-3-3-3-3-8",
        # decoder specific
        decoder_n_filters: int = 32,
        decoder_ratios: Optional[List[int]] = None,  # if None, same as encoder
        decoder_depths: Optional[str] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # common parameters
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.layernorm_eps = layernorm_eps
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value
        self.mixer_layer = mixer_layer

        # encoder specific parameters
        self.encoder_n_filters = encoder_n_filters
        # Copy the list: the signature default is a single list object that
        # would otherwise be shared by every default-constructed instance.
        self.encoder_ratios = None if encoder_ratios is None else list(encoder_ratios)
        self.encoder_depths = encoder_depths

        # decoder specific parameters
        # Fall back to the encoder ratios, but as an independent copy so that
        # editing one list never silently changes the other.
        if decoder_ratios is None:
            decoder_ratios = encoder_ratios
        self.decoder_ratios = None if decoder_ratios is None else list(decoder_ratios)
        self.decoder_n_filters = decoder_n_filters
        self.decoder_depths = decoder_depths
|
||||
|
||||
|
||||
class VibeVoiceSemanticTokenizerConfig(PretrainedConfig):
    """Configuration holder for the VibeVoice semantic tokenizer.

    Every constructor argument is stored on the instance under the same
    name. ``encoder_ratios`` is stored as a copy so that instances never
    share the list object used as the signature default. This config has
    encoder settings only; it defines no decoder parameters.

    Args:
        channels, corpus_normalize, causal, vae_dim, fix_std, std_dist_type:
            General tokenizer settings, stored as given.
        mixer_layer, conv_norm, pad_mode, disable_last_norm, layernorm,
        layernorm_eps, layernorm_elementwise_affine, conv_bias,
        layer_scale_init_value, weight_init_value:
            Settings grouped as "common" in the signature, stored as given.
        encoder_n_filters, encoder_ratios, encoder_depths:
            Encoder-specific settings. ``encoder_depths`` is a dash-separated
            string (default ``"3-3-3-3-3-3-8"``).
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_semantic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0,
        std_dist_type: str = 'none',
        # common
        mixer_layer: str = 'depthwise_conv',
        conv_norm: str = 'none',
        pad_mode: str = 'constant',
        disable_last_norm: bool = True,
        layernorm: str = 'RMSNorm',
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2],
        encoder_depths: str = "3-3-3-3-3-3-8",
        **kwargs
    ):
        super().__init__(**kwargs)
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # common parameters
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.layernorm_eps = layernorm_eps
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value
        self.mixer_layer = mixer_layer

        # encoder specific parameters
        self.encoder_n_filters = encoder_n_filters
        # Copy the list: the signature default is a single list object that
        # would otherwise be shared by every default-constructed instance.
        self.encoder_ratios = None if encoder_ratios is None else list(encoder_ratios)
        self.encoder_depths = encoder_depths
|
||||
|
||||
|
||||
class VibeVoiceDiffusionHeadConfig(PretrainedConfig):
    """Configuration holder for the VibeVoice diffusion head.

    Each named constructor argument is stored on the instance under the same
    name; the remaining keyword arguments are then handed to
    ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_diffusion_head"

    def __init__(
        self,
        hidden_size=768,
        head_layers=4,
        head_ffn_ratio=3.0,
        rms_norm_eps=1e-5,
        latent_size=64,
        speech_vae_dim=None,
        prediction_type="v_prediction",
        diffusion_type="ddpm",
        ddpm_num_steps=1000,
        ddpm_num_inference_steps=20,
        ddpm_beta_schedule="cosine",
        ddpm_batch_mul=4,
        **kwargs
    ):
        # Insertion order matches the signature, so attributes are assigned
        # in the same sequence as the parameters are declared.
        head_settings = {
            "hidden_size": hidden_size,
            "head_layers": head_layers,
            "head_ffn_ratio": head_ffn_ratio,
            "rms_norm_eps": rms_norm_eps,
            "latent_size": latent_size,
            "speech_vae_dim": speech_vae_dim,
            "prediction_type": prediction_type,
            "diffusion_type": diffusion_type,
            "ddpm_num_steps": ddpm_num_steps,
            "ddpm_num_inference_steps": ddpm_num_inference_steps,
            "ddpm_beta_schedule": ddpm_beta_schedule,
            "ddpm_batch_mul": ddpm_batch_mul,
        }
        for attr_name, attr_value in head_settings.items():
            setattr(self, attr_name, attr_value)

        # The base-class initialiser runs last, after the head attributes
        # have been populated.
        super().__init__(**kwargs)
|
||||
|
||||
class VibeVoiceConfig(PretrainedConfig):
    """Composite configuration bundling the four VibeVoice sub-configs.

    Each ``*_config`` argument may be ``None`` (use the sub-config class
    defaults), a ``dict`` of constructor keyword arguments, or an
    already-built config instance. Any other type raises ``TypeError``
    instead of silently leaving the attribute unset.

    Args:
        acoustic_tokenizer_config: Acoustic tokenizer settings.
        semantic_tokenizer_config: Semantic tokenizer settings.
        decoder_config: Decoder settings. A dict must carry
            ``model_type == "qwen2"``.
        diffusion_head_config: Diffusion head settings.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``decoder_config`` is a dict whose ``model_type`` is
            not ``"qwen2"``.
        TypeError: If any sub-config argument is neither ``None``, a dict,
            nor an instance of the expected config class.
    """

    model_type = "vibevoice"
    is_composition = True
    sub_configs = {
        "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig,
        "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig,
        "decoder_config": Qwen2Config,
        "diffusion_head_config": VibeVoiceDiffusionHeadConfig,
    }
    # keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Qwen2`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        acoustic_tokenizer_config=None,
        semantic_tokenizer_config=None,
        decoder_config=None,
        diffusion_head_config=None,
        **kwargs
    ):

        # kwargs["_attn_implementation"] = "flash_attention_2"
        kwargs["_attn_implementation_autoset"] = False

        self.acoustic_tokenizer_config = self._build_sub_config(
            "acoustic_tokenizer_config", acoustic_tokenizer_config
        )
        self.semantic_tokenizer_config = self._build_sub_config(
            "semantic_tokenizer_config", semantic_tokenizer_config
        )

        # The decoder is handled separately: a dict is only accepted when it
        # explicitly declares the supported "qwen2" model type.
        if decoder_config is None:
            self.decoder_config = self.sub_configs["decoder_config"]()
        elif isinstance(decoder_config, dict):
            decoder_model_type = decoder_config.get("model_type", '')
            if decoder_model_type != "qwen2":
                raise ValueError(f"Unsupported decoder model type: {decoder_model_type}")
            self.decoder_config = Qwen2Config(**decoder_config)
        elif isinstance(decoder_config, (Qwen2Config,)):
            # If an instance of the config class is provided
            self.decoder_config = decoder_config
        else:
            raise TypeError(
                "decoder_config must be None, a dict, or a Qwen2Config instance, "
                f"got {type(decoder_config).__name__}"
            )

        self.diffusion_head_config = self._build_sub_config(
            "diffusion_head_config", diffusion_head_config
        )

        # other parameters
        # Mirror the tokenizers' vae_dim at the top level; the fallbacks only
        # apply if a sub-config lacks a `vae_dim` attribute.
        self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64)
        self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128)

        super().__init__(**kwargs)

    def _build_sub_config(self, key, value):
        """Return the sub-config for ``key`` built from ``None``, a dict, or an instance."""
        config_cls = self.sub_configs[key]
        if value is None:
            return config_cls()
        if isinstance(value, dict):
            # Work on a copy so the caller's dict is not modified when the
            # model_type entry is forced to the sub-config's own type.
            params = dict(value)
            params["model_type"] = config_cls.model_type
            return config_cls(**params)
        if isinstance(value, config_cls):
            # If an instance of the config class is provided
            return value
        raise TypeError(
            f"{key} must be None, a dict, or a {config_cls.__name__} instance, "
            f"got {type(value).__name__}"
        )
|
||||
|
||||
# Names exported by `from ... import *` for this module.
__all__ = [
    "VibeVoiceAcousticTokenizerConfig",
    "VibeVoiceSemanticTokenizerConfig",
    "VibeVoiceDiffusionHeadConfig",
    "VibeVoiceConfig",
]
|
||||
Reference in New Issue
Block a user