
vllm.reasoning.minimax_m2_reasoning_parser

logger module-attribute

logger = init_logger(__name__)

MiniMaxM2AppendThinkReasoningParser

Bases: ReasoningParser

Reasoning parser for the MiniMax M2 model that re-inserts the missing <think> tag and returns the full output, reasoning included, as content.

Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """
    Reasoning parser for MiniMax M2 model.
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.end_token_id = self.vocab.get("</think>")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        end_token_id = self.end_token_id
        # Scan from the end: </think> usually appears near the tail of the
        # sequence, so `any` over the reversed ids short-circuits quickly.
        return any(input_id == end_token_id for input_id in reversed(input_ids))

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # Every token id counts as content for this parser; nothing is stripped.
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        # The model omits the <think> start token; re-insert it on the very
        # first delta so downstream consumers see a complete thinking block.
        if len(previous_token_ids) == 0:
            delta_text = "<think>" + delta_text
        return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        return None, "<think>" + model_output

end_token_id instance-attribute

end_token_id = vocab.get('</think>')

__init__

__init__(tokenizer: TokenizerLike, *args, **kwargs)
Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
    super().__init__(tokenizer, *args, **kwargs)
    self.end_token_id = self.vocab.get("</think>")

extract_content_ids

extract_content_ids(input_ids: list[int]) -> list[int]
Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    # Every token id counts as content for this parser; nothing is stripped.
    return input_ids

extract_reasoning

extract_reasoning(
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]
Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def extract_reasoning(
    self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
) -> tuple[str | None, str | None]:
    # Reasoning is never split out; return everything as content with the
    # missing <think> tag restored.
    return None, "<think>" + model_output

extract_reasoning_streaming

extract_reasoning_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None
Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    # The model omits the <think> start token; re-insert it on the very
    # first delta so downstream consumers see a complete thinking block.
    if len(previous_token_ids) == 0:
        delta_text = "<think>" + delta_text
    return DeltaMessage(content=delta_text)

is_reasoning_end

is_reasoning_end(input_ids: list[int]) -> bool
Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    end_token_id = self.end_token_id
    # Scan from the end: </think> usually appears near the tail of the
    # sequence, so `any` over the reversed ids short-circuits quickly.
    return any(input_id == end_token_id for input_id in reversed(input_ids))
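
A quick standalone check of the reverse scan; the id 7 standing in for </think> is made up.

def is_reasoning_end(input_ids: list[int], end_token_id: int) -> bool:
    # Mirrors the method above: True once </think> has been generated.
    return any(t == end_token_id for t in reversed(input_ids))

END_ID = 7  # hypothetical vocab id for "</think>"
print(is_reasoning_end([1, 2, END_ID, 3], END_ID))  # True
print(is_reasoning_end([1, 2, 3], END_ID))          # False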

MiniMaxM2ReasoningParser

Bases: BaseThinkingReasoningParser

Reasoning parser for MiniMax M2 model.

MiniMax M2 models don't generate a <think> start token, only a </think> end token. All content before </think> is reasoning; content after it is the actual response.

Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for MiniMax M2 model.

    MiniMax M2 models don't generate a <think> start token, only a
    </think> end token. All content before </think> is reasoning; content
    after it is the actual response.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message for streaming.

        MiniMax M2 models don't generate a <think> start token, so we assume
        all content is reasoning until we encounter the </think> end token.
        """
        # Skip single end token
        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
            return None

        # Check if end token has already appeared in previous tokens
        # meaning we're past the reasoning phase
        if self.end_token_id in previous_token_ids:
            # We're past the reasoning phase, this is content
            return DeltaMessage(content=delta_text)

        # Check if end token is in delta tokens
        if self.end_token_id in delta_token_ids:
            # End token in delta, split reasoning and content
            end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]
            return DeltaMessage(
                reasoning=reasoning if reasoning else None,
                content=content if content else None,
            )

        # No end token yet, all content is reasoning
        return DeltaMessage(reasoning=delta_text)
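
The branching above is easiest to see on a concrete stream. The sketch below replays the same branches in plain Python without importing vLLM; the token ids (with 7 standing in for </think>) and the delta strings are invented for illustration.

END_TOKEN = "</think>"
END_ID = 7  # hypothetical vocab id for "</think>"

def split_delta(
    delta_text: str,
    delta_token_ids: list[int],
    previous_token_ids: list[int],
) -> dict | None:
    # A delta that is exactly the end token carries no text to emit.
    if len(delta_token_ids) == 1 and delta_token_ids[0] == END_ID:
        return None
    # Reasoning already ended in an earlier delta: everything is content.
    if END_ID in previous_token_ids:
        return {"content": delta_text}
    # The end token arrives inside this delta: split around it.
    if END_ID in delta_token_ids:
        i = delta_text.find(END_TOKEN)
        reasoning, content = delta_text[:i], delta_text[i + len(END_TOKEN):]
        return {"reasoning": reasoning or None, "content": content or None}
    # Otherwise we are still inside the implicit reasoning block.
    return {"reasoning": delta_text}

print(split_delta("thinking...", [1, 2], []))             # reasoning only
print(split_delta("done</think>Hi", [3, END_ID, 4], [1])) # split in two
print(split_delta(" there", [5], [1, END_ID]))            # content only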

end_token property

end_token: str

The token that ends reasoning content.

start_token property

start_token: str

The token that starts reasoning content.

extract_reasoning_streaming

extract_reasoning_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None

Extract reasoning content from a delta message for streaming.

MiniMax M2 models don't generate a <think> start token, so all content is assumed to be reasoning until the </think> end token is encountered.

Source code in vllm/reasoning/minimax_m2_reasoning_parser.py
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """
    Extract reasoning content from a delta message for streaming.

    MiniMax M2 models don't generate a <think> start token, so we assume
    all content is reasoning until we encounter the </think> end token.
    """
    # Skip single end token
    if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
        return None

    # Check if end token has already appeared in previous tokens
    # meaning we're past the reasoning phase
    if self.end_token_id in previous_token_ids:
        # We're past the reasoning phase, this is content
        return DeltaMessage(content=delta_text)

    # Check if end token is in delta tokens
    if self.end_token_id in delta_token_ids:
        # End token in delta, split reasoning and content
        end_index = delta_text.find(self.end_token)
        reasoning = delta_text[:end_index]
        content = delta_text[end_index + len(self.end_token) :]
        return DeltaMessage(
            reasoning=reasoning if reasoning else None,
            content=content if content else None,
        )

    # No end token yet, all content is reasoning
    return DeltaMessage(reasoning=delta_text)
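
Putting the pieces together: a toy driver that feeds a whole tokenized stream through the same branching and reassembles the final reasoning and content strings, including the subtle point that a skipped lone end token must still count toward previous_token_ids. The word-level tokenization and the id 7 for </think> are made up; real usage goes through the model tokenizer.

END_TOKEN = "</think>"
END_ID = 7  # hypothetical vocab id for "</think>"

def run_stream(deltas: list[tuple[str, list[int]]]) -> tuple[str, str]:
    reasoning_parts: list[str] = []
    content_parts: list[str] = []
    seen: list[int] = []  # plays the role of previous_token_ids
    for delta_text, delta_ids in deltas:
        if len(delta_ids) == 1 and delta_ids[0] == END_ID:
            pass  # lone end token: no message, but it still becomes "seen"
        elif END_ID in seen:
            content_parts.append(delta_text)
        elif END_ID in delta_ids:
            i = delta_text.find(END_TOKEN)
            reasoning_parts.append(delta_text[:i])
            content_parts.append(delta_text[i + len(END_TOKEN):])
        else:
            reasoning_parts.append(delta_text)
        seen.extend(delta_ids)
    return "".join(reasoning_parts), "".join(content_parts)

stream = [
    ("Let me think. ", [1, 2]),
    ("</think>", [END_ID]),       # emitted alone: skipped, no message
    ("The answer is 4.", [3, 4]),
]
print(run_stream(stream))  # ('Let me think. ', 'The answer is 4.')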