Source code for groq_qa_generator.tokenizer

import tiktoken


def count_tokens(text):
    """Count the number of tokens in a given text using the "cl100k_base" model.

    This function uses the "cl100k_base" encoding model from `tiktoken` to
    encode the input text and counts the total number of tokens generated.

    Args:
        text (str): The input text for which tokens need to be counted.

    Returns:
        int: The total number of tokens in the input text.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    # encode() returns a list of token ids, so len() gives the count
    # directly — no need to iterate it with a counting generator.
    return len(encoding.encode(text))
def generate_text_chunks(file_path, chunk_size):
    """Read text from a file and split it into chunks based on a token limit.

    This function reads the input text file line by line and accumulates text
    into chunks such that the total number of tokens in each chunk does not
    exceed `chunk_size`. When the token limit is reached, the chunk is added
    to the list of chunks.

    Note: a single line whose token count alone exceeds `chunk_size` is still
    emitted as one (oversized) chunk, since lines are never split.

    Args:
        file_path (str): The path to the text file that needs to be chunked.
        chunk_size (int): The maximum number of tokens allowed in each chunk.

    Returns:
        list of str: A list containing text chunks where each chunk respects
        the token limit.
    """
    chunks = []
    chunk = ""
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            # If adding the current line would exceed the limit, flush the
            # accumulated chunk and start a new one with this line.
            if count_tokens(chunk + text) > chunk_size:
                # Guard against flushing an empty chunk (e.g. when the very
                # first line already exceeds chunk_size) — the original code
                # appended an empty string here.
                if chunk.strip():
                    chunks.append(chunk.strip())
                chunk = text
            else:
                # Append the line to the current chunk.
                chunk += " " + text
    # Append any remaining text as the final chunk.
    if chunk:
        chunks.append(chunk.strip())
    return chunks