PyGPT4All API Reference

pygpt4all.models.gpt4all

GPT4All with the llama.cpp backend, through pyllamacpp

GPT4All

GPT4All(
    model_path,
    prompt_context="",
    prompt_prefix="",
    prompt_suffix="",
    log_level=logging.ERROR,
    n_ctx=512,
    seed=0,
    n_parts=-1,
    f16_kv=False,
    logits_all=False,
    vocab_only=False,
    use_mlock=False,
    embedding=False,
)

Bases: pyllamacpp.model.Model

GPT4All model

Example usage

from pygpt4all.models.gpt4all import GPT4All

model = GPT4All('path/to/gpt4all/model')
for token in model.generate("Tell me a joke ?"):
    print(token, end='', flush=True)

Parameters:

Name            Type   Description                                                         Default
model_path      str    the path to the gpt4all model                                       required
prompt_context  str    the global context of the interaction                               ''
prompt_prefix   str    the prompt prefix                                                   ''
prompt_suffix   str    the prompt suffix                                                   ''
log_level       int    logging level, set to ERROR by default                              logging.ERROR
n_ctx           int    LLaMA context                                                       512
seed            int    random seed                                                         0
n_parts         int    LLaMA n_parts                                                       -1
f16_kv          bool   use fp16 for KV cache                                               False
logits_all      bool   the llama_eval() call computes all logits, not just the last one    False
vocab_only      bool   only load the vocabulary, no weights                                False
use_mlock       bool   force system to keep model in RAM                                   False
embedding       bool   embedding mode only                                                 False
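
The keyword arguments above map directly onto the constructor. A minimal sketch, assuming a hypothetical local model file:

from pygpt4all.models.gpt4all import GPT4All

# './models/gpt4all-converted.bin' is a placeholder path, not a file shipped with the package
model = GPT4All('./models/gpt4all-converted.bin',
                n_ctx=512,       # LLaMA context size
                seed=42,         # fix the seed for repeatable sampling
                use_mlock=True)  # ask the OS to keep the model weights in RAM
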
Source code in pygpt4all/models/gpt4all.py
def __init__(self,
             model_path: str,
             prompt_context: str = '',
             prompt_prefix: str = '',
             prompt_suffix: str = '',
             log_level: int = logging.ERROR,
             n_ctx: int = 512,
             seed: int = 0,
             n_parts: int = -1,
             f16_kv: bool = False,
             logits_all: bool = False,
             vocab_only: bool = False,
             use_mlock: bool = False,
             embedding: bool = False):
    """
    :param model_path: the path to the gpt4all model
    :param prompt_context: the global context of the interaction
    :param prompt_prefix: the prompt prefix
    :param prompt_suffix: the prompt suffix
    :param log_level: logging level, set to ERROR by default
    :param n_ctx: LLaMA context
    :param seed: random seed
    :param n_parts: LLaMA n_parts
    :param f16_kv: use fp16 for KV cache
    :param logits_all: the llama_eval() call computes all logits, not just the last one
    :param vocab_only: only load the vocabulary, no weights
    :param use_mlock: force system to keep model in RAM
    :param embedding: embedding mode only
    """
    # set logging level
    set_log_level(log_level)
    super(GPT4All, self).__init__(model_path=model_path,
                                  prompt_context=prompt_context,
                                  prompt_prefix=prompt_prefix,
                                  prompt_suffix=prompt_suffix,
                                  log_level=log_level,
                                  n_ctx=n_ctx,
                                  seed=seed,
                                  n_parts=n_parts,
                                  f16_kv=f16_kv,
                                  logits_all=logits_all,
                                  vocab_only=vocab_only,
                                  use_mlock=use_mlock,
                                  embedding=embedding)

pygpt4all.models.gpt4all_j

GPT4All-J with the ggml backend, through pygptj

GPT4All_J

GPT4All_J(
    model_path,
    prompt_context="",
    prompt_prefix="",
    prompt_suffix="",
    log_level=logging.ERROR,
)

Bases: pygptj.model.Model

GPT4ALL-J model

Example usage

from pygpt4all.models.gpt4all_j import GPT4All_J

model = GPT4All_J('path/to/gpt4all-j/model')
for token in model.generate("Tell me a joke ?"):
    print(token, end='', flush=True)

Parameters:

Name            Type   Description                               Default
model_path      str    The path to a gpt4all-j model             required
prompt_context  str    the global context of the interaction     ''
prompt_prefix   str    the prompt prefix                         ''
prompt_suffix   str    the prompt suffix                         ''
log_level       int    logging level, set to ERROR by default    logging.ERROR
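
The prompt_context, prompt_prefix and prompt_suffix arguments can prepend a global context and wrap each prompt in a fixed template. A sketch with a hypothetical model path and an illustrative chat-style template:

from pygpt4all.models.gpt4all_j import GPT4All_J

# the path and the template strings below are illustrative only
model = GPT4All_J('./models/ggml-gpt4all-j.bin',
                  prompt_context="You are a helpful assistant.\n",
                  prompt_prefix="\nUser: ",
                  prompt_suffix="\nAssistant: ")

for token in model.generate("What is the capital of France?"):
    print(token, end='', flush=True)
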
Source code in pygpt4all/models/gpt4all_j.py
def __init__(self,
             model_path: str,
             prompt_context: str = '',
             prompt_prefix: str = '',
             prompt_suffix: str = '',
             log_level: int = logging.ERROR):
    """
    :param model_path: The path to a gpt4all-j model
    :param prompt_context: the global context of the interaction
    :param prompt_prefix: the prompt prefix
    :param prompt_suffix: the prompt suffix
    :param log_level: logging level, set to ERROR by default
    """
    # set logging level
    set_log_level(log_level)
    super(GPT4All_J, self).__init__(model_path=model_path,
                                    prompt_context=prompt_context,
                                    prompt_prefix=prompt_prefix,
                                    prompt_suffix=prompt_suffix,
                                    log_level=log_level)

Bases

pyllamacpp.model

This module contains a simple Python API around llama.cpp

Model

Model(
    model_path,
    prompt_context="",
    prompt_prefix="",
    prompt_suffix="",
    log_level=logging.ERROR,
    n_ctx=512,
    seed=0,
    n_parts=-1,
    f16_kv=False,
    logits_all=False,
    vocab_only=False,
    use_mlock=False,
    embedding=False,
)

A simple Python class on top of llama.cpp

Example usage

from pyllamacpp.model import Model

model = Model(model_path='path/to/ggml/model')
for token in model.generate("Tell me a joke ?"):
    print(token, end='', flush=True)

Parameters:

Name            Type   Description                                                         Default
model_path      str    the path to the ggml model                                          required
prompt_context  str    the global context of the interaction                               ''
prompt_prefix   str    the prompt prefix                                                   ''
prompt_suffix   str    the prompt suffix                                                   ''
log_level       int    logging level, set to ERROR by default                              logging.ERROR
n_ctx           int    LLaMA context                                                       512
seed            int    random seed                                                         0
n_parts         int    LLaMA n_parts                                                       -1
f16_kv          bool   use fp16 for KV cache                                               False
logits_all      bool   the llama_eval() call computes all logits, not just the last one    False
vocab_only      bool   only load the vocabulary, no weights                                False
use_mlock       bool   force system to keep model in RAM                                   False
embedding       bool   embedding mode only                                                 False
Source code in pyllamacpp/model.py
def __init__(self,
             model_path: str,
             prompt_context: str = '',
             prompt_prefix: str = '',
             prompt_suffix: str = '',
             log_level: int = logging.ERROR,
             n_ctx: int = 512,
             seed: int = 0,
             n_parts: int = -1,
             f16_kv: bool = False,
             logits_all: bool = False,
             vocab_only: bool = False,
             use_mlock: bool = False,
             embedding: bool = False):
    """
    :param model_path: the path to the ggml model
    :param prompt_context: the global context of the interaction
    :param prompt_prefix: the prompt prefix
    :param prompt_suffix: the prompt suffix
    :param log_level: logging level, set to ERROR by default
    :param n_ctx: LLaMA context
    :param seed: random seed
    :param n_parts: LLaMA n_parts
    :param f16_kv: use fp16 for KV cache
    :param logits_all: the llama_eval() call computes all logits, not just the last one
    :param vocab_only: only load the vocabulary, no weights
    :param use_mlock: force system to keep model in RAM
    :param embedding: embedding mode only
    """

    # set logging level
    set_log_level(log_level)
    self._ctx = None

    if not Path(model_path).is_file():
        raise Exception(f"File {model_path} not found!")

    self.llama_params = pp.llama_context_default_params()
    # update llama_params
    self.llama_params.n_ctx = n_ctx
    self.llama_params.seed = seed
    self.llama_params.n_parts = n_parts
    self.llama_params.f16_kv = f16_kv
    self.llama_params.logits_all = logits_all
    self.llama_params.vocab_only = vocab_only
    self.llama_params.use_mlock = use_mlock
    self.llama_params.embedding = embedding

    self._ctx = pp.llama_init_from_file(model_path, self.llama_params)

    # gpt params
    self.gpt_params = pp.gpt_params()

    self.res = ""

    self._n_ctx = pp.llama_n_ctx(self._ctx)
    self._last_n_tokens = [0] * self._n_ctx  # n_ctx elements
    self._n_past = 0
    self.prompt_cntext = prompt_context
    self.prompt_prefix = prompt_prefix
    self.prompt_suffix = prompt_suffix

    self._prompt_context_tokens = []
    self._prompt_prefix_tokens = []
    self._prompt_suffix_tokens = []

    self.reset()

reset

reset()

Resets the context
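
A short sketch of one typical use, assuming model is an already-constructed pyllamacpp Model (or GPT4All) instance: clearing the accumulated context between two unrelated prompts.

# prompts are illustrative; `model` is assumed to be already loaded
first = ''.join(model.generate("Summarize the plot of Hamlet.", n_predict=64))

model.reset()  # drop the accumulated context before switching topics

second = ''.join(model.generate("Write a haiku about spring.", n_predict=32))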

Source code in pyllamacpp/model.py
def reset(self) -> None:
    """Resets the context"""
    self._prompt_context_tokens = pp.llama_tokenize(self._ctx, self.prompt_cntext, True)
    self._prompt_prefix_tokens = pp.llama_tokenize(self._ctx, self.prompt_prefix, True)
    self._prompt_suffix_tokens = pp.llama_tokenize(self._ctx, self.prompt_suffix, True)
    self._last_n_tokens = [0] * self._n_ctx  # n_ctx elements
    self._n_past = 0

tokenize

tokenize(text)

Returns a list of tokens for the text

Parameters:

Name   Type   Description            Default
text   str    text to be tokenized   required

Returns:

List of tokens

Source code in pyllamacpp/model.py
def tokenize(self, text:str):
    """
    Returns a list of tokens for the text
    :param text: text to be tokenized
    :return: List of tokens
    """
    return pp.llama_tokenize(self._ctx, text, True)

detokenize

detokenize(tokens)

Returns the text represented by a list of tokens

Parameters:

Name     Type   Description                                   Default
tokens   list   the list of tokens to convert back to text    required

Returns:

A string representing the text extracted from the tokens
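
Together with tokenize(), this gives a simple round trip; a sketch, assuming model is a loaded pyllamacpp Model:

tokens = model.tokenize("Hello, world!")  # list of token ids
text = model.detokenize(tokens)           # back to a single string
print(len(tokens), repr(text))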

Source code in pyllamacpp/model.py
def detokenize(self, tokens:list):
    """
    Returns the text represented by a list of tokens
    :param tokens: the list of tokens to convert back to text
    :return: A string representing the text extracted from the tokens
    """
    return pp.llama_tokens_to_str(self._ctx, tokens)

generate

generate(
    prompt,
    n_predict=None,
    antiprompt=None,
    infinite_generation=False,
    n_threads=4,
    repeat_last_n=64,
    top_k=40,
    top_p=0.95,
    temp=0.8,
    repeat_penalty=1.1,
)

Runs llama.cpp inference and yields new predicted tokens from the prompt provided as input

Parameters:

Name                 Type               Description                                                       Default
prompt               str                The prompt :)                                                     required
n_predict            Union[None, int]   if not None, inference stops once n_predict tokens have been     None
                                        generated; otherwise it continues until EOS
antiprompt           str                aka the stop word; generation stops if this word is predicted.   None
                                        Keep it None to handle stopping in your own way
infinite_generation  bool               set to True to let the generation run indefinitely               False
n_threads            int                The number of CPU threads                                         4
repeat_last_n        int                last n tokens to penalize                                         64
top_k                int                top K sampling parameter                                          40
top_p                float              top P sampling parameter                                          0.95
temp                 float              temperature                                                       0.8
repeat_penalty       float              repeat penalty sampling parameter                                 1.1

Returns:

Type        Description
Generator   Tokens generator
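
A sketch of a bounded, streamed generation using the parameters above; the model path and prompt are illustrative:

from pyllamacpp.model import Model

model = Model('./models/ggml-model.bin')  # placeholder path

for token in model.generate("Q: Name three planets.\nA:",
                            n_predict=64,      # stop after at most 64 new tokens
                            antiprompt="Q:",   # also stop if the model starts a new question
                            n_threads=8,
                            temp=0.7,
                            top_p=0.9):
    print(token, end='', flush=True)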

Source code in pyllamacpp/model.py
def generate(self,
             prompt: str,
             n_predict: Union[None, int] = None,
             antiprompt: str = None,
             infinite_generation: bool = False,
             n_threads: int = 4,
             repeat_last_n: int = 64,
             top_k: int = 40,
             top_p: float = 0.95,
             temp: float = 0.8,
             repeat_penalty: float = 1.10) -> Generator:
    """
    Runs llama.cpp inference and yields new predicted tokens from the prompt provided as input

    :param prompt: The prompt :)
    :param n_predict: if n_predict is not None, the inference will stop if it reaches `n_predict` tokens, otherwise
                      it will continue until `EOS`
    :param antiprompt: aka the stop word, the generation will stop if this word is predicted,
                       keep it None to handle it in your own way
    :param infinite_generation: set it to `True` to make the generation go infinitely
    :param n_threads: The number of CPU threads
    :param repeat_last_n: last n tokens to penalize
    :param top_k: top K sampling parameter
    :param top_p: top P sampling parameter
    :param temp: temperature
    :param repeat_penalty: repeat penalty sampling parameter
    :return: Tokens generator
    """
    input_tokens = self._prompt_prefix_tokens + pp.llama_tokenize(self._ctx, prompt,
                                                                  True) + self._prompt_suffix_tokens
    if len(input_tokens) > self._n_ctx - 4:
        raise Exception('Prompt too long!')
    predicted_tokens = []
    predicted_token = 0

    # add global context for the first time
    if self._n_past == 0:
        for tok in self._prompt_context_tokens:
            predicted_tokens.append(tok)
            self._last_n_tokens.pop(0)
            self._last_n_tokens.append(tok)

    # consume input tokens
    for tok in input_tokens:
        predicted_tokens.append(tok)
        self._last_n_tokens.pop(0)
        self._last_n_tokens.append(tok)

    n_remain = 0
    if antiprompt is not None:
        sequence_queue = []
        stop_word = antiprompt.strip()

    while infinite_generation or predicted_token != pp.llama_token_eos():
        if len(predicted_tokens) > 0:
            if (pp.llama_eval(self._ctx,
                              predicted_tokens,
                              len(predicted_tokens),
                              self._n_past,
                              n_threads)):
                raise Exception("failed to eval the model!")
            self._n_past += len(predicted_tokens)
            predicted_tokens.clear()

        predicted_token = pp.llama_sample_top_p_top_k(self._ctx,
                                                      self._last_n_tokens[self._n_ctx - repeat_last_n:],
                                                      repeat_last_n,
                                                      top_k,
                                                      top_p,
                                                      temp,
                                                      repeat_penalty)

        predicted_tokens.append(predicted_token)
        # tokens come as raw undecoded bytes,
        # and we decode them, replacing those that can't be decoded.
        # i decoded here for fear of breaking the stopword logic, 
        token_str = pp.llama_token_to_str(self._ctx, predicted_token).decode('utf-8', "replace")
        if antiprompt is not None:
            if token_str == '\n':
                sequence_queue.append(token_str)
                continue
            if len(sequence_queue) != 0:
                if stop_word.startswith(''.join(sequence_queue).strip()):
                    sequence_queue.append(token_str)
                    if ''.join(sequence_queue).strip() == stop_word:
                        break
                    else:
                        continue
                else:
                    # consume sequence queue tokens
                    while len(sequence_queue) != 0:
                        yield sequence_queue.pop(0)
                    sequence_queue = []
        self._last_n_tokens.pop(0)
        self._last_n_tokens.append(predicted_token)
        yield token_str
        if n_predict is not None:
            if n_remain == n_predict:
                break
            else:
                n_remain += 1

cpp_generate

cpp_generate(
    prompt,
    n_predict=128,
    new_text_callback=None,
    n_threads=4,
    repeat_last_n=64,
    top_k=40,
    top_p=0.95,
    temp=0.8,
    repeat_penalty=1.1,
    n_batch=8,
    n_keep=0,
    interactive=False,
    antiprompt=[],
    ignore_eos=False,
    instruct=False,
    verbose_prompt=False,
)

The generate function from llama.cpp

Parameters:

Name               Type                       Description                                               Default
prompt             str                        the prompt                                                required
n_predict          int                        number of tokens to generate                              128
new_text_callback  Callable[[bytes], None]    a callback function called when new text is generated    None
n_threads          int                        The number of CPU threads                                 4
repeat_last_n      int                        last n tokens to penalize                                 64
top_k              int                        top K sampling parameter                                  40
top_p              float                      top P sampling parameter                                  0.95
temp               float                      temperature                                               0.8
repeat_penalty     float                      repeat penalty sampling parameter                         1.1
n_batch            int                        GPT params n_batch                                        8
n_keep             int                        GPT params n_keep                                         0
interactive        bool                       interactive communication                                 False
antiprompt         List                       list of anti prompts                                      []
ignore_eos         bool                       Ignore LLaMA EOS                                          False
instruct           bool                       Activate instruct mode                                    False
verbose_prompt     bool                       verbose prompt                                            False

Returns:

Type   Description
str    the new generated text
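
A sketch of driving cpp_generate with a streaming callback; per the signature above the callback receives raw bytes, and the model path is illustrative:

from pyllamacpp.model import Model

def on_text(chunk: bytes) -> None:
    # decode leniently before printing the streamed chunk
    print(chunk.decode('utf-8', errors='replace'), end='', flush=True)

model = Model('./models/ggml-model.bin')  # placeholder path
full_text = model.cpp_generate("Tell me a joke ?",
                               n_predict=64,
                               new_text_callback=on_text,
                               n_threads=8)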

Source code in pyllamacpp/model.py
def cpp_generate(self, prompt: str,
                 n_predict: int = 128,
                 new_text_callback: Callable[[bytes], None] = None,
                 n_threads: int = 4,
                 repeat_last_n: int = 64,
                 top_k: int = 40,
                 top_p: float = 0.95,
                 temp: float = 0.8,
                 repeat_penalty: float = 1.10,
                 n_batch: int = 8,
                 n_keep: int = 0,
                 interactive: bool = False,
                 antiprompt: List = [],
                 ignore_eos: bool = False,
                 instruct: bool = False,
                 verbose_prompt: bool = False,
                 ) -> str:
    """
    The generate function from `llama.cpp`

    :param prompt: the prompt
    :param n_predict: number of tokens to generate
    :param new_text_callback: a callback function called when new text is generated, default `None`
    :param n_threads: The number of CPU threads
    :param repeat_last_n: last n tokens to penalize
    :param top_k: top K sampling parameter
    :param top_p: top P sampling parameter
    :param temp: temperature
    :param repeat_penalty: repeat penalty sampling parameter
    :param n_batch: GPT params n_batch
    :param n_keep: GPT params n_keep
    :param interactive: interactive communication
    :param antiprompt: list of anti prompts
    :param ignore_eos: Ignore LLaMA EOS
    :param instruct: Activate instruct mode
    :param verbose_prompt: verbose prompt
    :return: the new generated text
    """
    self.gpt_params.prompt = prompt
    self.gpt_params.n_predict = n_predict
    # update other params if any
    self.gpt_params.n_threads = n_threads
    self.gpt_params.repeat_last_n = repeat_last_n
    self.gpt_params.top_k = top_k
    self.gpt_params.top_p = top_p
    self.gpt_params.temp = temp
    self.gpt_params.repeat_penalty = repeat_penalty
    self.gpt_params.n_batch = n_batch
    self.gpt_params.n_keep = n_keep
    self.gpt_params.interactive = interactive
    self.gpt_params.antiprompt = antiprompt
    self.gpt_params.ignore_eos = ignore_eos
    self.gpt_params.instruct = instruct
    self.gpt_params.verbose_prompt = verbose_prompt

    # assign new_text_callback
    self.res = ""
    Model._new_text_callback = new_text_callback

    # run the prediction
    pp.llama_generate(self._ctx, self.gpt_params, self._call_new_text_callback)
    return self.res

get_params staticmethod

get_params(params)

Returns a dict representation of the params

Returns:

Type   Description
dict   params dict
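
A sketch of inspecting the generation parameters, assuming model is a loaded pyllamacpp Model (gpt_params and llama_params are the attributes set in __init__ above):

params = model.get_params(model.gpt_params)  # also works with model.llama_params
for name, value in params.items():
    print(f"{name} = {value}")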

Source code in pyllamacpp/model.py
@staticmethod
def get_params(params) -> dict:
    """
    Returns a `dict` representation of the params
    :return: params dict
    """
    res = {}
    for param in dir(params):
        if param.startswith('__'):
            continue
        res[param] = getattr(params, param)
    return res

pygptj.model

This module contains a simple Python API around gpt-j

Model

Model(
    model_path,
    prompt_context="",
    prompt_prefix="",
    prompt_suffix="",
    log_level=logging.ERROR,
)

GPT-J model

Example usage

from pygptj.model import Model

model = Model(model_path='path/to/ggml/model')
for token in model.generate("Tell me a joke ?"):
    print(token, end='', flush=True)

Parameters:

Name            Type   Description                              Default
model_path      str    The path to a gpt-j ggml model           required
prompt_context  str    the global context of the interaction    ''
prompt_prefix   str    the prompt prefix                        ''
prompt_suffix   str    the prompt suffix                        ''
log_level       int    logging level                            logging.ERROR
Source code in pygptj/model.py
def __init__(self,
             model_path: str,
             prompt_context: str = '',
             prompt_prefix: str = '',
             prompt_suffix: str = '',
             log_level: int = logging.ERROR):
    """
    :param model_path: The path to a gpt-j `ggml` model
    :param prompt_context: the global context of the interaction
    :param prompt_prefix: the prompt prefix
    :param prompt_suffix: the prompt suffix
    :param log_level: logging level
    """
    # set logging level
    set_log_level(log_level)
    self._ctx = None

    if not Path(model_path).is_file():
        raise Exception(f"File {model_path} not found!")

    self.model_path = model_path

    self._model = pp.gptj_model()
    self._vocab = pp.gpt_vocab()

    # load model
    self._load_model()

    # gpt params
    self.gpt_params = pp.gptj_gpt_params()
    self.hparams = pp.gptj_hparams()

    self.res = ""

    self.logits = []

    self._n_past = 0
    self.prompt_cntext = prompt_context
    self.prompt_prefix = prompt_prefix
    self.prompt_suffix = prompt_suffix

    self._prompt_context_tokens = []
    self._prompt_prefix_tokens = []
    self._prompt_suffix_tokens = []

    self.reset()

generate

generate(
    prompt,
    n_predict=None,
    antiprompt=None,
    seed=None,
    n_threads=4,
    top_k=40,
    top_p=0.9,
    temp=0.9,
)

Runs GPT-J inference and yields new predicted tokens

Parameters:

Name        Type               Description                                                     Default
prompt      str                The prompt :)                                                   required
n_predict   Union[None, int]   if not None, inference stops once n_predict tokens have been   None
                               generated; otherwise it continues until the end-of-text token
antiprompt  str                aka the stop word; generation stops if this word is predicted. None
                               Keep it None to handle stopping in your own way
seed        int                random seed                                                     None
n_threads   int                The number of CPU threads                                       4
top_k       int                top K sampling parameter                                        40
top_p       float              top P sampling parameter                                        0.9
temp        float              temperature                                                     0.9

Returns:

Type        Description
Generator   Tokens generator
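
A sketch of a reproducible, bounded generation with the GPT-J backend; the model path is illustrative:

from pygptj.model import Model

model = Model('./models/ggml-gptj.bin')  # placeholder path

for token in model.generate("Tell me a joke ?",
                            n_predict=48,  # stop after at most 48 new tokens
                            seed=1234,     # fixed seed for repeatable sampling
                            n_threads=8):
    print(token, end='', flush=True)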

Source code in pygptj/model.py
def generate(self,
             prompt: str,
             n_predict: Union[None, int] = None,
             antiprompt: str = None,
             seed: int = None,
             n_threads: int = 4,
             top_k: int = 40,
             top_p: float = 0.9,
             temp: float = 0.9,
             ) -> Generator:
    """
     Runs GPT-J inference and yields new predicted tokens

    :param prompt: The prompt :)
    :param n_predict: if n_predict is not None, the inference will stop if it reaches `n_predict` tokens, otherwise
                      it will continue until `end of text` token
    :param antiprompt: aka the stop word, the generation will stop if this word is predicted,
                       keep it None to handle it in your own way
    :param seed: random seed
    :param n_threads: The number of CPU threads
    :param top_k: top K sampling parameter
    :param top_p: top P sampling parameter
    :param temp: temperature

    :return: Tokens generator
    """
    if seed is None or seed < 0:
        seed = int(time.time())

    logging.info(f'seed = {seed}')

    if self._n_past == 0 or antiprompt is None:
        # add the prefix to the context
        embd_inp = self._prompt_prefix_tokens + pp.gpt_tokenize(self._vocab, prompt) + self._prompt_suffix_tokens
    else:
        # do not add the prefix again as it is already in the previous generated context
        embd_inp = pp.gpt_tokenize(self._vocab, prompt) + self._prompt_suffix_tokens

    if n_predict is not None:
        n_predict = min(n_predict, self.hparams.n_ctx - len(embd_inp))
    logging.info(f'Number of tokens in prompt = {len(embd_inp)}')

    embd = []
    # add global context for the first time
    if self._n_past == 0:
        for tok in self._prompt_context_tokens:
            embd.append(tok)

    # consume input tokens
    for tok in embd_inp:
        embd.append(tok)

    # determine the required inference memory per token:
    mem_per_token = 0
    logits, mem_per_token = pp.gptj_eval(self._model, n_threads, 0, [0, 1, 2, 3], mem_per_token)

    i = len(embd) - 1
    id = 0
    if antiprompt is not None:
        sequence_queue = []
        stop_word = antiprompt.strip()

    while id != 50256:  # end of text token
        if n_predict is not None:  # break the generation if n_predict
            if i >= (len(embd_inp) + n_predict):
                break
        i += 1
        # predict
        if len(embd) > 0:
            try:
                logits, mem_per_token = pp.gptj_eval(self._model, n_threads, self._n_past, embd, mem_per_token)
                self.logits.append(logits)
            except Exception as e:
                print(f"Failed to predict\n {e}")
                return

        self._n_past += len(embd)
        embd.clear()

        if i >= len(embd_inp):
            # sample next token
            n_vocab = self.hparams.n_vocab
            t_start_sample_us = int(round(time.time() * 1000000))
            id = pp.gpt_sample_top_k_top_p(self._vocab, logits[-n_vocab:], top_k, top_p, temp, seed)
            if id == 50256:  # end of text token
                break
            # add the token to the context
            embd.append(id)
            token = self._vocab.id_to_token[id]
            # antiprompt
            if antiprompt is not None:
                if token == '\n':
                    sequence_queue.append(token)
                    continue
                if len(sequence_queue) != 0:
                    if stop_word.startswith(''.join(sequence_queue).strip()):
                        sequence_queue.append(token)
                        if ''.join(sequence_queue).strip() == stop_word:
                            break
                        else:
                            continue
                    else:
                        # consume sequence queue tokens
                        while len(sequence_queue) != 0:
                            yield sequence_queue.pop(0)
                        sequence_queue = []

            yield token

cpp_generate

cpp_generate(
    prompt,
    new_text_callback=None,
    logits_callback=None,
    n_predict=128,
    seed=-1,
    n_threads=4,
    top_k=40,
    top_p=0.9,
    temp=0.9,
    n_batch=8,
)

Runs inference through the C++ generate function

Parameters:

Name               Type                            Description                                                     Default
prompt             str                             the prompt                                                      required
new_text_callback  Callable[[str], None]           a callback function called when new text is generated          None
logits_callback    Callable[[np.ndarray], None]    a callback function to access the logits on every inference    None
n_predict          int                             number of tokens to generate                                    128
seed               int                             The random seed                                                  -1
n_threads          int                             Number of threads                                                4
top_k              int                             top_k sampling parameter                                         40
top_p              float                           top_p sampling parameter                                         0.9
temp               float                           temperature sampling parameter                                   0.9
n_batch            int                             batch size for prompt processing                                 8

Returns:

Type   Description
str    the new generated text
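
A sketch that streams text and also captures the per-step logits through logits_callback; the model path is illustrative:

import numpy as np
from pygptj.model import Model

collected = []

def on_text(text: str) -> None:
    print(text, end='', flush=True)

def on_logits(logits: np.ndarray) -> None:
    collected.append(np.array(logits, copy=True))  # keep a copy of each step's logits

model = Model('./models/ggml-gptj.bin')  # placeholder path
model.cpp_generate("Tell me a joke ?",
                   n_predict=32,
                   new_text_callback=on_text,
                   logits_callback=on_logits)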

Source code in pygptj/model.py
def cpp_generate(self,
                 prompt: str,
                 new_text_callback: Callable[[str], None] = None,
                 logits_callback: Callable[[np.ndarray], None] = None,
                 n_predict: int = 128,
                 seed: int = -1,
                 n_threads: int = 4,
                 top_k: int = 40,
                 top_p: float = 0.9,
                 temp: float = 0.9,
                 n_batch: int = 8,
                 ) -> str:
    """
    Runs the inference to cpp generate function

    :param prompt: the prompt
    :param new_text_callback: a callback function called when new text is generated, default `None`
    :param logits_callback: a callback function to access the logits on every inference
    :param n_predict: number of tokens to generate
    :param seed: The random seed
    :param n_threads: Number of threads
    :param top_k: top_k sampling parameter
    :param top_p: top_p sampling parameter
    :param temp: temperature sampling parameter
    :param n_batch: batch size for prompt processing

    :return: the new generated text
    """
    self.gpt_params.prompt = prompt
    self.gpt_params.n_predict = n_predict
    self.gpt_params.seed = seed
    self.gpt_params.n_threads = n_threads
    self.gpt_params.top_k = top_k
    self.gpt_params.top_p = top_p
    self.gpt_params.temp = temp
    self.gpt_params.n_batch = n_batch

    # assign new_text_callback
    self.res = ""
    Model._new_text_callback = new_text_callback

    # assign _logits_callback used for saving logits, token by token
    Model._logits_callback = logits_callback

    # run the prediction
    pp.gptj_generate(self.gpt_params, self._model, self._vocab, self._call_new_text_callback,
                     self._call_logits_callback)
    return self.res

braindump

braindump(path)

Dumps the logits to .npy

Parameters:

Name   Type   Description   Default
path   str    Output path   required

Returns:

None
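
A sketch of the intended flow, assuming model is a pygptj Model: run generate() (which appends the logits of each step to model.logits) and then dump them to disk:

# consume the generator so the per-step logits accumulate on the model
_ = ''.join(model.generate("Tell me a joke ?", n_predict=32))

model.braindump("logits.npy")  # saves the collected logits with numpy.save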

Source code in pygptj/model.py
def braindump(self, path: str) -> None:
    """
    Dumps the logits to .npy
    :param path: Output path
    :return: None
    """
    np.save(path, np.asarray(self.logits))

reset

reset()

Resets the context

Returns:

None

Source code in pygptj/model.py
def reset(self) -> None:
    """
    Resets the context
    :return: None
    """
    self._n_past = 0
    self._prompt_context_tokens = pp.gpt_tokenize(self._vocab, self.prompt_cntext)
    self._prompt_prefix_tokens = pp.gpt_tokenize(self._vocab, self.prompt_prefix)
    self._prompt_suffix_tokens = pp.gpt_tokenize(self._vocab, self.prompt_suffix)

get_params staticmethod

get_params(params)

Returns a dict representation of the params

Returns:

Type   Description
dict   params dict

Source code in pygptj/model.py
@staticmethod
def get_params(params) -> dict:
    """
    Returns a `dict` representation of the params
    :return: params dict
    """
    res = {}
    for param in dir(params):
        if param.startswith('__'):
            continue
        res[param] = getattr(params, param)
    return res