Cloning from GitHub
vibevoice/modular/modular_vibevoice_text_tokenizer.py (new file, 214 lines added)
@@ -0,0 +1,214 @@
"""Tokenization classes for VibeVoice."""

from typing import List, Optional, Union

from transformers.utils import logging
from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast


logger = logging.get_logger(__name__)


class VibeVoiceTextTokenizer(Qwen2Tokenizer):
    """
    Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for VibeVoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_prefix_space=False,
        add_special_tokens=True,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

        # Add VibeVoice-specific special tokens
        self._add_vibevoice_special_tokens()

    def _add_vibevoice_special_tokens(self):
        """Add VibeVoice-specific special tokens."""
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # Speech start (reusing vision tokens)
                "<|vision_end|>",  # Speech end
                "<|vision_pad|>",  # Speech diffusion pad
            ]
        }
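        # On vocabularies that already define these tokens (e.g. Qwen2.5 /
        # Qwen2-VL checkpoints), add_special_tokens() registers nothing new and
        # the lookups below resolve to the ids already present in the vocab.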
        num_added = self.add_special_tokens(special_tokens)

        # Cache special token IDs
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")

        self._eos_id = self.convert_tokens_to_ids("<|endoftext|>")

        return num_added

    @property
    def eos_id(self) -> int:
        """Id of the end of sequence token."""
        return self._eos_id

    @property
    def speech_start_id(self) -> int:
        """Id of the speech start token."""
        return self._speech_start_id

    @property
    def speech_end_id(self) -> int:
        """Id of the speech end token."""
        return self._speech_end_id

    @property
    def speech_diffusion_id(self) -> int:
        """Id of the speech diffusion token."""
        return self._speech_diffusion_id

    @property
    def pad_id(self) -> int:
        """Id used for padding (returns -100 for loss masking)."""
        return -100

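# Minimal usage sketch for the slow tokenizer (illustrative only: the
# vocab.json / merges.txt paths are assumed to come from a Qwen2-family
# checkpoint and are not shipped with this module):
#
#     tok = VibeVoiceTextTokenizer("vocab.json", "merges.txt")
#     ids = tok.encode("Hello there.")
#     tok.speech_start_id, tok.speech_end_id  # ids of <|vision_start|> / <|vision_end|>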


class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast):
    """
    Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library).
    Based on the Qwen2 tokenizer with additional special tokens for speech.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a [tokenizers](https://github.com/huggingface/tokenizers) file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for VibeVoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        # Add VibeVoice-specific special tokens
        self._add_vibevoice_special_tokens()

    def _add_vibevoice_special_tokens(self):
        """Add VibeVoice-specific special tokens."""
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # Speech start (reusing vision tokens)
                "<|vision_end|>",  # Speech end
                "<|vision_pad|>",  # Speech diffusion pad
            ]
        }
        num_added = self.add_special_tokens(special_tokens)

        # Cache special token IDs
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")

        # Use eos_token_id rather than hard-coding <|endoftext|>, since Qwen2-
        # and Qwen3-based checkpoints configure different eos tokens.
        self._eos_id = self.eos_token_id
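        # Unlike the slow tokenizer above, whose pad_id is the -100 loss-masking
        # sentinel, this class pads with a real token id (<|image_pad|>).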
        self._pad_id = self.convert_tokens_to_ids("<|image_pad|>")

        return num_added

    @property
    def eos_id(self) -> int:
        """Id of the end of sequence token."""
        return self._eos_id

    @property
    def speech_start_id(self) -> int:
        """Id of the speech start token."""
        return self._speech_start_id

    @property
    def speech_end_id(self) -> int:
        """Id of the speech end token."""
        return self._speech_end_id

    @property
    def speech_diffusion_id(self) -> int:
        """Id of the speech diffusion token."""
        return self._speech_diffusion_id

    @property
    def pad_id(self) -> int:
        """Id of the padding token (`<|image_pad|>`), cached at init."""
        return self._pad_id


__all__ = [
    "VibeVoiceTextTokenizer",
    "VibeVoiceTextTokenizerFast",
]
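

# Illustrative smoke test (an assumption, not part of the upstream module):
# any Qwen2.5-family repo that ships tokenizer files with the vision tokens
# in its vocabulary should work; "Qwen/Qwen2.5-1.5B" is used as a stand-in.
if __name__ == "__main__":
    tok = VibeVoiceTextTokenizerFast.from_pretrained("Qwen/Qwen2.5-1.5B")
    print("speech start/end/diffusion ids:",
          tok.speech_start_id, tok.speech_end_id, tok.speech_diffusion_id)
    print("pad id (<|image_pad|>):", tok.pad_id)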