Source code for robot.parsing.lexer.tokenizer

#  Copyright 2008-2015 Nokia Networks
#  Copyright 2016-     Robot Framework Foundation
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import re

from robot.utils import rstrip

from .tokens import Token


class Tokenizer(object):
    _space_splitter = re.compile(r'(\s{2,}|\t)', re.UNICODE)
    _pipe_splitter = re.compile(r'((?:\A|\s+)\|(?:\s+|\Z))', re.UNICODE)
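
    # Both patterns contain a single capturing group, so re.split() keeps the
    # separators in its result and the pieces alternate between data and
    # separator. An illustrative example (added here for clarity, not part of
    # the original source):
    #
    #     _space_splitter.split('    Log    Hello')
    #     -> ['', '    ', 'Log', '    ', 'Hello']
    #
    # _split_from_spaces() below relies on exactly this alternation.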

    def tokenize(self, data, data_only=False):
        current = []
        for lineno, line in enumerate(data.splitlines(not data_only), start=1):
            tokens = self._tokenize_line(line, lineno, not data_only)
            tokens, starts_new = self._cleanup_tokens(tokens, data_only)
            if starts_new:
                if current:
                    yield current
                current = tokens
            else:
                current.extend(tokens)
        yield current
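
    # Note on the helper below (descriptive comment, not in the original
    # source): _tokenize_line() creates data cells with type None, so no
    # token type is decided at this stage. SEPARATOR tokens are emitted only
    # when include_separators is true, and in that case an EOL token holding
    # the line's trailing whitespace is appended as well.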

    def _tokenize_line(self, line, lineno, include_separators=True):
        # Performance optimized code.
        tokens = []
        append = tokens.append
        offset = 0
        if line[:1] != '|':
            splitter = self._split_from_spaces
        else:
            splitter = self._split_from_pipes
        for value, is_data in splitter(rstrip(line)):
            if is_data:
                append(Token(None, value, lineno, offset))
            elif include_separators:
                append(Token(Token.SEPARATOR, value, lineno, offset))
            offset += len(value)
        if include_separators:
            trailing_whitespace = line[len(rstrip(line)):]
            append(Token(Token.EOL, trailing_whitespace, lineno, offset))
        return tokens

    def _split_from_spaces(self, line):
        is_data = True
        for value in self._space_splitter.split(line):
            yield value, is_data
            is_data = not is_data

    def _split_from_pipes(self, line):
        splitter = self._pipe_splitter
        _, separator, rest = splitter.split(line, 1)
        yield separator, False
        while splitter.search(rest):
            token, separator, rest = splitter.split(rest, 1)
            yield token, True
            yield separator, False
        yield rest, True

    def _cleanup_tokens(self, tokens, data_only):
        has_data = self._handle_comments(tokens)
        continues = self._handle_continuation(tokens)
        self._remove_trailing_empty(tokens)
        if continues:
            self._remove_leading_empty(tokens)
            self._ensure_data_after_continuation(tokens)
        if data_only:
            tokens = self._remove_non_data(tokens)
        return tokens, has_data and not continues

    def _handle_comments(self, tokens):
        has_data = False
        commented = False
        for token in tokens:
            if token.type is None:
                if token.value.startswith('#') or commented:
                    token.type = Token.COMMENT
                    commented = True
                elif token.value:
                    has_data = True
        return has_data

    def _handle_continuation(self, tokens):
        for token in tokens:
            if token.value == '...' and token.type is None:
                token.type = Token.CONTINUATION
                return True
            elif token.value and token.type != Token.SEPARATOR:
                return False
        return False

    def _remove_trailing_empty(self, tokens):
        # list() needed w/ IronPython, otherwise reversed() alone is enough.
        # https://github.com/IronLanguages/ironpython2/issues/699
        for token in reversed(list(tokens)):
            if not token.value and token.type != Token.EOL:
                tokens.remove(token)
            elif token.type is None:
                break

    def _remove_leading_empty(self, tokens):
        data_or_continuation = (None, Token.CONTINUATION)
        for token in list(tokens):
            if not token.value:
                tokens.remove(token)
            elif token.type in data_or_continuation:
                break

    def _ensure_data_after_continuation(self, tokens):
        if not any(t.type is None for t in tokens):
            cont = self._find_continuation(tokens)
            token = Token(lineno=cont.lineno, col_offset=cont.end_col_offset)
            tokens.insert(tokens.index(cont) + 1, token)

    def _find_continuation(self, tokens):
        for token in tokens:
            if token.type == Token.CONTINUATION:
                return token

    def _remove_non_data(self, tokens):
        return [t for t in tokens if t.type is None]
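

A minimal usage sketch, not part of the module above. It assumes Robot
Framework is installed so that the module shown on this page is importable,
and it relies only on the tokenize() signature and Token attributes visible
in the source:

    from robot.parsing.lexer.tokenizer import Tokenizer

    data = ("*** Test Cases ***\n"
            "Example\n"
            "    Log    Hello    # greeting\n")

    # Each yielded item is a list of Token objects forming one statement.
    # With data_only=True, separator, comment and EOL tokens are dropped.
    for statement in Tokenizer().tokenize(data, data_only=True):
        print([token.value for token in statement])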