
Language Models

bocoel.GenerativeModel

Bases: Protocol

generate abstractmethod

generate(prompts: Sequence[str]) -> Sequence[str]

Generate a sequence of responses given prompts. The returned sequence has the same length as the prompts: each response is a continuation of its prompt, so the prompt is a prefix of the corresponding response.

Parameters

prompts: Sequence[str] The prompts to generate responses from.

Returns

A sequence of responses, one per prompt. Each response is a string.

Source code in bocoel/models/lms/interfaces/generative.py
@abc.abstractmethod
def generate(self, prompts: Sequence[str], /) -> Sequence[str]:
    """
    Generate a sequence of responses given prompts.
    The returned sequence has the same length as the prompts:
    each response is a continuation of its prompt,
    so the prompt is a prefix of the corresponding response.

    Parameters
    ----------

    `prompts: Sequence[str]`
    The prompts to generate responses from.


    Returns
    -------

    A sequence of responses, one per prompt.
    Each response is a string.
    """

    ...
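
To illustrate the contract, here is a minimal sketch of a conforming implementation. The EchoModel class is hypothetical, for illustration only:

from collections.abc import Sequence

from bocoel import GenerativeModel


class EchoModel(GenerativeModel):
    """A hypothetical toy model that appends a fixed suffix to each prompt."""

    def generate(self, prompts: Sequence[str], /) -> Sequence[str]:
        # Each response starts with its prompt, satisfying the prefix contract,
        # and the output has the same length as the input.
        return [prompt + " ..." for prompt in prompts]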

bocoel.ClassifierModel

Bases: Protocol

choices property

choices: Sequence[str]

The choices for this language model.

classify

classify(prompts: Sequence[str]) -> NDArray

Generate logits given prompts.

Parameters

prompts: Sequence[str] The prompts to generate logits from.

Returns

An array of logits with one row per prompt. Each row has one entry per choice.

Source code in bocoel/models/lms/interfaces/classifiers.py
def classify(self, prompts: Sequence[str], /) -> NDArray:
    """
    Generate logits given prompts.

    Parameters
    ----------

    `prompts: Sequence[str]`
    The prompts to generate logits from.

    Returns
    -------

    An array of logits with one row per prompt.
    Each row has one entry per choice.
    """

    classified = self._classify(prompts)

    if list(classified.shape) != [len(prompts), len(self.choices)]:
        raise ValueError(
            f"Expected logits to have shape {[len(prompts), len(self.choices)]}, "
            f"but got {classified.shape}"
        )

    return classified

_classify abstractmethod

_classify(prompts: Sequence[str]) -> NDArray

Generate logits given prompts.

Parameters

prompts: Sequence[str] The prompts to generate logits from.

Returns

An array of logits. Must have the shape [batch_size, len(choices)].

Source code in bocoel/models/lms/interfaces/classifiers.py
@abc.abstractmethod
def _classify(self, prompts: Sequence[str], /) -> NDArray:
    """
    Generate logits given prompts.

    Parameters
    ----------

    `prompts: Sequence[str]`
    The prompts to generate logits from.

    Returns
    -------

    An array of logits. Must have the shape [batch_size, len(choices)].
    """

    ...
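
As a sketch of how the two methods fit together: a concrete class supplies choices and _classify, and the inherited classify then enforces the [batch_size, len(choices)] shape. The UniformClassifier below is hypothetical, for illustration only:

from collections.abc import Sequence

import numpy as np
from numpy.typing import NDArray

from bocoel import ClassifierModel


class UniformClassifier(ClassifierModel):
    """A hypothetical classifier that scores every choice equally."""

    def __init__(self, choices: Sequence[str]) -> None:
        self._choices = choices

    @property
    def choices(self) -> Sequence[str]:
        return self._choices

    def _classify(self, prompts: Sequence[str], /) -> NDArray:
        # Returns shape [batch_size, len(choices)], as classify() requires.
        return np.zeros([len(prompts), len(self._choices)])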

bocoel.HuggingfaceTokenizer

HuggingfaceTokenizer(model_path: str, device: str)

Source code in bocoel/models/lms/huggingface/tokenizers.py
def __init__(self, model_path: str, device: str) -> None:
    # Optional dependency.
    from transformers import AutoTokenizer

    # Initializes the tokenizer and pad to the left for sequence generation.
    self._tokenizer = AutoTokenizer.from_pretrained(
        model_path, padding_side="left", truncation_side="left"
    )
    if (eos := self._tokenizer.eos_token) is not None:
        self._tokenizer.pad_token = eos
    else:
        self._tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    if self._tokenizer.sep_token is None:
        self._tokenizer.add_special_tokens({"sep_token": "[SEP]"})

    self._device = device

tokenize

tokenize(prompts: Sequence[str])

Tokenize, pad, truncate, move to the target device, and return the encoded results. The return type is BatchEncoding, but it is left out of the type hint because transformers is an optional dependency.

Source code in bocoel/models/lms/huggingface/tokenizers.py
def tokenize(self, prompts: Sequence[str], /):
    """
    Tokenize, pad, truncate, move to the target device,
    and return the encoded results.
    The return type is `BatchEncoding`, left out of the type hint
    because `transformers` is an optional dependency.
    """
    if not isinstance(prompts, list):
        prompts = list(prompts)

    inputs = self._tokenizer(
        prompts,
        return_tensors="pt",
        max_length=self._tokenizer.model_max_length,
        padding=True,
        truncation=True,
    )
    return inputs.to(self.device)
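
A quick usage sketch (the checkpoint name is an example; any Huggingface model with a tokenizer should work):

from bocoel import HuggingfaceTokenizer

# Hypothetical example: tokenize a small batch on CPU.
tokenizer = HuggingfaceTokenizer(model_path="gpt2", device="cpu")
encoded = tokenizer.tokenize(["Hello world", "Bayesian optimization is"])
print(encoded["input_ids"].shape)  # [batch_size, padded_seq_len]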

bocoel.HuggingfaceGenerativeLM

HuggingfaceGenerativeLM(model_path: str, batch_size: int, device: str)

Bases: GenerativeModel

The Huggingface implementation of GenerativeModel. This is a wrapper around the Huggingface transformers library, which pulls the model from the Huggingface hub.

Since Huggingface's tokenizer pads on the left for generation, padded batches don't guarantee the same positional embeddings, and thus the same results, as unpadded prompts. If results identical to generating one by one are desired, the batch size should be 1.

Source code in bocoel/models/lms/huggingface/generative.py
def __init__(self, model_path: str, batch_size: int, device: str) -> None:
    # Optional dependency.
    from transformers import AutoModelForCausalLM

    self._model_path = model_path
    self._tokenizer = HuggingfaceTokenizer(model_path=model_path, device=device)

    # Model used for generation
    self._model = AutoModelForCausalLM.from_pretrained(model_path)
    self._model.pad_token = self._tokenizer.pad_token

    self._batch_size = batch_size

    self.to(device)
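
A usage sketch (gpt2 stands in for any causal LM checkpoint; batch_size=1 keeps results identical to one-by-one generation):

from bocoel import HuggingfaceGenerativeLM

# batch_size=1 avoids left padding, so outputs match one-by-one generation.
lm = HuggingfaceGenerativeLM(model_path="gpt2", batch_size=1, device="cpu")
responses = lm.generate(["The capital of France is"])
print(responses[0])  # A continuation that starts with the prompt.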

bocoel.HuggingfaceLogitsLM

HuggingfaceLogitsLM(
    model_path: str, batch_size: int, device: str, choices: Sequence[str]
)

Bases: HuggingfaceGenerativeLM, ClassifierModel

The Huggingface implementation of ClassifierModel that uses logits for classification. For example, if choices = ['1', '2', '3', '4', '5'], the model uses the logits of those five tokens as the output for the current batch of inputs.

Source code in bocoel/models/lms/huggingface/logits.py
def __init__(
    self,
    model_path: str,
    batch_size: int,
    device: str,
    choices: Sequence[str],
) -> None:
    super().__init__(model_path, batch_size, device=device)

    self._choices = choices
    self._encoded_choices = self._encode_tokens(self._choices)
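
A usage sketch (the checkpoint and prompt are examples only):

from bocoel import HuggingfaceLogitsLM

lm = HuggingfaceLogitsLM(
    model_path="gpt2",
    batch_size=4,
    device="cpu",
    choices=["1", "2", "3", "4", "5"],
)
logits = lm.classify(["Rate this review from 1 to 5: Great product!"])
print(logits.shape)  # (1, 5): one row per prompt, one column per choice.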

classify

classify(prompts: Sequence[str]) -> NDArray

Generate logits given prompts.

Parameters

prompts: Sequence[str] The prompts to generate logits from.

Returns

An array of logits with one row per prompt. Each row has one entry per choice.

Source code in bocoel/models/lms/interfaces/classifiers.py
def classify(self, prompts: Sequence[str], /) -> NDArray:
    """
    Generate logits given prompts.

    Parameters
    ----------

    `prompts: Sequence[str]`
    The prompts to generate logits from.

    Returns
    -------

    An array of logits with one row per prompt.
    Each row has one entry per choice.
    """

    classified = self._classify(prompts)

    if list(classified.shape) != [len(prompts), len(self.choices)]:
        raise ValueError(
            f"Expected logits to have shape {[len(prompts), len(self.choices)]}, "
            f"but got {classified.shape}"
        )

    return classified

bocoel.HuggingfaceSequenceLM

HuggingfaceSequenceLM(model_path: str, device: str, choices: Sequence[str])

Bases: ClassifierModel

Source code in bocoel/models/lms/huggingface/sequences.py
def __init__(
    self,
    model_path: str,
    device: str,
    choices: Sequence[str],
) -> None:
    # Optional dependency.
    from transformers import AutoModelForSequenceClassification

    self._model_path = model_path
    self._tokenizer = HuggingfaceTokenizer(model_path=model_path, device=device)

    self._choices = choices

    classifier = AutoModelForSequenceClassification.from_pretrained(model_path)
    self._classifier = classifier.to(device)
    self._classifier.config.pad_token_id = self._tokenizer.pad_token_id

classify

classify(prompts: Sequence[str]) -> NDArray

Generate logits given prompts.

Parameters

prompts: Sequence[str] The prompts to generate logits from.

Returns

An array of logits with one row per prompt. Each row has one entry per choice.

Source code in bocoel/models/lms/interfaces/classifiers.py
def classify(self, prompts: Sequence[str], /) -> NDArray:
    """
    Generate logits given prompts.

    Parameters
    ----------

    `prompts: Sequence[str]`
    The prompts to generate logits from.

    Returns
    -------

    An array of logits with one row per prompt.
    Each row has one entry per choice.
    """

    classified = self._classify(prompts)

    if list(classified.shape) != [len(prompts), len(self.choices)]:
        raise ValueError(
            f"Expected logits to have shape {[len(prompts), len(self.choices)]}, "
            f"but got {classified.shape}"
        )

    return classified
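
Finally, a usage sketch for the sequence-classification variant (the checkpoint is an example; it should carry a sequence-classification head whose labels match choices):

from bocoel import HuggingfaceSequenceLM

lm = HuggingfaceSequenceLM(
    model_path="distilbert-base-uncased-finetuned-sst-2-english",
    device="cpu",
    choices=["negative", "positive"],
)
logits = lm.classify(["What a wonderful movie!"])
print(logits.shape)  # (1, 2): one row per prompt, one column per choice.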