Source code for robot.parsing.lexer.lexer

#  Copyright 2008-2015 Nokia Networks
#  Copyright 2016-     Robot Framework Foundation
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from itertools import chain

from robot.errors import DataError
from robot.utils import get_error_message, FileReader

from .blocklexers import FileLexer
from .context import InitFileContext, TestCaseFileContext, ResourceFileContext
from .tokenizer import Tokenizer
from .tokens import EOS, Token


def get_tokens(source, data_only=False, tokenize_variables=False):
    """Parses the given source to tokens.

    :param source: The source where to read the data. Can be a path to a
        source file as a string or as ``pathlib.Path`` object, an already
        opened file object, or Unicode text containing the data directly.
        Source files must be UTF-8 encoded.
    :param data_only: When ``False`` (default), returns all tokens. When set
        to ``True``, omits separators, comments, continuation markers, and
        other non-data tokens.
    :param tokenize_variables: When ``True``, possible variables in keyword
        arguments and elsewhere are tokenized. See the
        :meth:`~robot.parsing.lexer.tokens.Token.tokenize_variables`
        method for details.

    Returns a generator that yields :class:`~robot.parsing.lexer.tokens.Token`
    instances.
    """
    lexer = Lexer(TestCaseFileContext(), data_only, tokenize_variables)
    lexer.input(source)
    return lexer.get_tokens()

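
A minimal usage sketch of ``get_tokens`` follows. It assumes the function is imported via ``robot.api`` (available in recent Robot Framework versions) and uses inline test data that is purely illustrative.

from robot.api import get_tokens

# Inline test data passed as text; a path or an open file object works too.
data = (
    '*** Test Cases ***\n'
    'Example\n'
    '    Log    Hello\n'
)

# With data_only=True, separators, comments and EOLs are omitted; data tokens
# and the synthetic EOS (end-of-statement) tokens are yielded.
for token in get_tokens(data, data_only=True):
    print(token.type, repr(token.value), token.lineno, token.col_offset)
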
def get_resource_tokens(source, data_only=False, tokenize_variables=False):
    """Parses the given source to resource file tokens.

    Otherwise same as :func:`get_tokens` but the source is considered to be
    a resource file. This affects, for example, what settings are valid.
    """
    lexer = Lexer(ResourceFileContext(), data_only, tokenize_variables)
    lexer.input(source)
    return lexer.get_tokens()

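
A hedged sketch of what the resource-file context changes in practice. The data below, and the expectation that an invalid setting gets an error token, are illustrative assumptions rather than guarantees made by this module.

from robot.api import get_tokens, get_resource_tokens

data = (
    '*** Settings ***\n'
    'Force Tags    example\n'
)

# 'Force Tags' is valid in test case files but not in resource files, so the
# resource-file context is expected to attach an error to the setting token.
for parse in (get_tokens, get_resource_tokens):
    errors = [t.error for t in parse(data, data_only=True) if t.error]
    print(parse.__name__, errors)
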
def get_init_tokens(source, data_only=False, tokenize_variables=False):
    """Parses the given source to init file tokens.

    Otherwise same as :func:`get_tokens` but the source is considered to be
    a suite initialization file. This affects, for example, what settings
    are valid.
    """
    lexer = Lexer(InitFileContext(), data_only, tokenize_variables)
    lexer.input(source)
    return lexer.get_tokens()

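
A similar sketch for suite initialization files; the ``__init__.robot``-style data below is an illustrative assumption.

from robot.api import get_init_tokens

init_data = (
    '*** Settings ***\n'
    'Suite Setup    Log    Starting suite\n'
)

for token in get_init_tokens(init_data, data_only=True):
    print(token.type, repr(token.value))
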
class Lexer(object):

    def __init__(self, ctx, data_only=False, tokenize_variables=False):
        self.lexer = FileLexer(ctx)
        self.data_only = data_only
        self.tokenize_variables = tokenize_variables
        self.statements = []

    def input(self, source):
        for statement in Tokenizer().tokenize(self._read(source),
                                              self.data_only):
            # Store all tokens but pass only data tokens to lexer.
            self.statements.append(statement)
            if self.data_only:
                data = statement[:]
            else:
                # Separators, comments, etc. already have type, data doesn't.
                data = [t for t in statement if t.type is None]
            if data:
                self.lexer.input(data)

    def _read(self, source):
        try:
            with FileReader(source, accept_text=True) as reader:
                return reader.read()
        except:
            raise DataError(get_error_message())

    def get_tokens(self):
        self.lexer.lex()
        statements = self._handle_old_for(self.statements)
        if not self.data_only:
            statements = chain.from_iterable(
                self._split_trailing_commented_and_empty_lines(s)
                for s in statements
            )
        tokens = self._get_tokens(statements)
        if self.tokenize_variables:
            tokens = self._tokenize_variables(tokens)
        return tokens

    def _get_tokens(self, statements):
        # Setting local variables is performance optimization to avoid
        # unnecessary lookups and attribute access.
        if self.data_only:
            ignored_types = {None, Token.COMMENT_HEADER, Token.COMMENT,
                             Token.OLD_FOR_INDENT}
        else:
            ignored_types = {None}
        name_types = (Token.TESTCASE_NAME, Token.KEYWORD_NAME)
        separator_type = Token.SEPARATOR
        eol_type = Token.EOL
        for statement in statements:
            name_seen = False
            separator_after_name = None
            prev_token = None
            for token in statement:
                token_type = token.type
                if token_type in ignored_types:
                    continue
                if name_seen:
                    if token_type == separator_type:
                        separator_after_name = token
                        continue
                    if token_type != eol_type:
                        yield EOS.from_token(prev_token)
                    if separator_after_name:
                        yield separator_after_name
                    name_seen = False
                if token_type in name_types:
                    name_seen = True
                prev_token = token
                yield token
            if prev_token:
                yield EOS.from_token(prev_token)

    def _handle_old_for(self, statements):
        end_statement = [Token(Token.SEPARATOR), Token(Token.END)]
        old_for = False
        for statement in statements:
            marker = self._get_first_data_token(statement)
            if marker:
                if marker.type == Token.OLD_FOR_INDENT:
                    old_for = True
                elif old_for:
                    if marker.type == Token.END:
                        # We get here if block has been indented with '\' but
                        # there is also 'END'. The former is deprecated and
                        # removing the value causes a deprecation warning.
                        marker.value = ''
                    else:
                        yield end_statement
                    old_for = False
            yield statement
        if old_for:
            yield end_statement

    def _get_first_data_token(self, statement):
        non_data_tokens = Token.NON_DATA_TOKENS + (None,)
        for token in statement:
            if token.type not in non_data_tokens:
                return token
        return None

    def _split_trailing_commented_and_empty_lines(self, statement):
        lines = self._split_to_lines(statement)
        commented_or_empty = []
        for line in reversed(lines):
            if not self._is_commented_or_empty(line):
                break
            commented_or_empty.append(line)
        if not commented_or_empty:
            return [statement]
        lines = lines[:-len(commented_or_empty)]
        statement = list(chain.from_iterable(lines))
        return [statement] + list(reversed(commented_or_empty))

    def _split_to_lines(self, statement):
        lines = []
        current = []
        for token in statement:
            current.append(token)
            if token.type == Token.EOL:
                lines.append(current)
                current = []
        if current:
            lines.append(current)
        return lines

    def _is_commented_or_empty(self, line):
        separator_or_ignore = (Token.SEPARATOR, None)
        comment_or_eol = (Token.COMMENT, Token.EOL)
        for token in line:
            if token.type not in separator_or_ignore:
                return token.type in comment_or_eol
        return False

    def _tokenize_variables(self, tokens):
        for token in tokens:
            for t in token.tokenize_variables():
                yield t
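
To illustrate how the module-level functions wire these pieces together, here is a hedged sketch that drives ``Lexer`` directly with an explicit context and variable tokenization enabled. The import paths follow this module's own imports, and the expected token split is an assumption rather than a guarantee.

from robot.parsing.lexer.context import TestCaseFileContext
from robot.parsing.lexer.lexer import Lexer

lexer = Lexer(TestCaseFileContext(), data_only=True, tokenize_variables=True)
lexer.input(
    '*** Test Cases ***\n'
    'Example\n'
    '    Log    Value is ${value}\n'
)
# With tokenize_variables=True, the argument 'Value is ${value}' is expected
# to be split into separate ARGUMENT and VARIABLE tokens by
# Token.tokenize_variables().
for token in lexer.get_tokens():
    print(token.type, repr(token.value))
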