# Copyright 2008-2015 Nokia Networks
# Copyright 2016- Robot Framework Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from robot.output import LOGGER
from robot.utils import PY2
# The HTML parser and the entity table live in different modules on
# Python 2 and Python 3, so pick the right ones based on the interpreter.
if PY2:
    from htmlentitydefs import entitydefs
    from HTMLParser import HTMLParser
else:
    from html.entities import entitydefs
    from html.parser import HTMLParser
    # Python 3 has no ``unichr`` builtin; alias it so the code below can use
    # the same name on both major versions.
    unichr = chr


# Non-breaking spaces found in cell data are replaced with regular spaces.
NON_BREAKING_SPACE = u'\xA0'
class HtmlReader(HTMLParser):
    """Read test data tables from an HTML file and pass rows to a populator.

    Tag events are dispatched through ``self._handlers``. Cell text is
    collected into rows, and each finished row is handed to the populator
    given to :meth:`read`. The first row of a table is passed to
    ``populator.start_table``; its return value decides whether the rest of
    that table is processed or ignored.
    """
    # Parser states: outside a table or inside a rejected one (IGNORE),
    # inside a table whose first row is not finished yet (INITIAL), and
    # inside a table the populator accepted (PROCESS).
    IGNORE = 0
    INITIAL = 1
    PROCESS = 2

    def __init__(self):
        HTMLParser.__init__(self)
        # Encoding used to decode input lines. May be replaced while parsing
        # by a ``<meta>`` tag or an ``<?xml ... encoding=...?>`` instruction.
        self._encoding = 'ISO-8859-1'
        # ``th`` cells are handled exactly like ``td`` cells.
        self._handlers = {'table_start' : self.table_start,
                          'table_end'   : self.table_end,
                          'tr_start'    : self.tr_start,
                          'tr_end'      : self.tr_end,
                          'td_start'    : self.td_start,
                          'td_end'      : self.td_end,
                          'th_start'    : self.td_start,
                          'th_end'      : self.td_end,
                          'br_start'    : self.br_start,
                          'meta_start'  : self.meta_start}

    def read(self, htmlfile, populator, path=None):
        """Parse ``htmlfile`` line by line and feed its tables to ``populator``.

        :param htmlfile: Object with ``readlines()`` returning lines that
            support ``decode()``. Its ``name`` attribute is used in the
            deprecation warning when ``path`` is not given.
        :param populator: Object providing ``start_table(row)``, ``add(row)``
            and ``eof()``.
        :param path: Optional path shown in the deprecation warning.
        """
        self.populator = populator
        self.state = self.IGNORE
        self.current_row = None
        self.current_cell = None
        for line in htmlfile.readlines():
            self.feed(self._decode(line))
        # Calling close is required by the HTMLParser but may cause problems
        # if the same instance of our HtmlReader is reused. Currently it's
        # used only once so there's no problem.
        self.close()
        if self.populator.eof():
            LOGGER.warn("Using test data in HTML format is deprecated. "
                        "Convert '%s' to plain text format."
                        % (path or htmlfile.name))

    def _decode(self, line):
        """Decode one input line using the currently active encoding."""
        return line.decode(self._encoding)

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to its registered handler, if there is one."""
        handler = self._handlers.get(tag+'_start')
        if handler is not None:
            handler(attrs)

    def handle_endtag(self, tag):
        """Dispatch an end tag to its registered handler, if there is one."""
        handler = self._handlers.get(tag+'_end')
        if handler is not None:
            handler()

    def handle_data(self, data):
        """Append text to the current cell, turning NBSPs into spaces."""
        if self.state == self.IGNORE or self.current_cell is None:
            return
        if NON_BREAKING_SPACE in data:
            data = data.replace(NON_BREAKING_SPACE, ' ')
        self.current_cell.append(data)

    def handle_entityref(self, name):
        """Resolve a named entity (``&name;``) and handle it as cell data."""
        value = self._handle_entityref(name)
        self.handle_data(value)

    def _handle_entityref(self, name):
        """Return the text for entity ``name``.

        Unknown entities are returned in their original ``&name;`` form.
        """
        if name == 'apos':    # missing from entitydefs
            return "'"
        try:
            value = entitydefs[name]
        except KeyError:
            return '&'+name+';'
        # Some entitydefs values are numeric references such as '&#913;'.
        if value.startswith('&#'):
            return unichr(int(value[2:-1]))
        if PY2:
            return value.decode('ISO-8859-1')
        return value

    def handle_charref(self, number):
        """Resolve a numeric reference (``&#N;``) and handle it as cell data."""
        value = self._handle_charref(number)
        self.handle_data(value)

    def _handle_charref(self, number):
        """Return the character for a decimal or ``x``-prefixed hex reference.

        References that cannot be converted to a character are returned
        unchanged in their original ``&#...;`` form, hex prefix included.
        """
        if number.startswith(('x', 'X')):
            base = 16
            digits = number[1:]
        else:
            base = 10
            digits = number
        try:
            return unichr(int(digits, base))
        except (ValueError, OverflowError):
            # ValueError: not a number or outside the valid code point range.
            # OverflowError: number too large for the conversion function.
            return '&#'+number+';'

    def unknown_decl(self, data):
        """Silently ignore unknown declarations."""
        # Ignore everything even if it's invalid. This kind of stuff comes
        # at least from MS Excel
        pass

    def table_start(self, attrs=None):
        """Enter a new table and discard any unfinished row or cell."""
        self.state = self.INITIAL
        self.current_row = None
        self.current_cell = None

    def table_end(self):
        """Finish a possibly open row and stop processing data."""
        if self.current_row is not None:
            self.tr_end()
        self.state = self.IGNORE

    def tr_start(self, attrs=None):
        """Start a new row, first closing a row that was left open."""
        if self.current_row is not None:
            self.tr_end()
        self.current_row = []

    def tr_end(self):
        """Finish the current row and pass it to the populator.

        The first row of a table goes to ``populator.start_table`` and its
        return value selects the PROCESS or IGNORE state. Later rows go to
        ``populator.add`` only while in the PROCESS state.
        """
        if self.current_row is None:
            return
        if self.current_cell is not None:
            self.td_end()
        if self.state == self.INITIAL:
            accepted = self.populator.start_table(self.current_row)
            self.state = self.PROCESS if accepted else self.IGNORE
        elif self.state == self.PROCESS:
            self.populator.add(self.current_row)
        self.current_row = None

    def td_start(self, attrs=None):
        """Start a new cell, closing an open cell and opening a row if needed."""
        if self.current_cell is not None:
            self.td_end()
        if self.current_row is None:
            self.tr_start()
        self.current_cell = []

    def td_end(self):
        """Join the collected cell text and append it to the current row."""
        if self.current_cell is not None and self.state != self.IGNORE:
            cell = ''.join(self.current_cell)
            self.current_row.append(cell)
        self.current_cell = None

    def br_start(self, attrs=None):
        """Treat ``<br>`` as a newline inside the current cell."""
        self.handle_data('\n')

    def meta_start(self, attrs):
        """Switch the input encoding if a ``<meta>`` tag declares one."""
        encoding = self._get_encoding_from_meta(attrs)
        if encoding:
            self._encoding = encoding

    def _get_encoding_from_meta(self, attrs):
        """Return the encoding declared by ``<meta>`` attributes, or ``None``.

        Supports ``charset="..."`` as well as ``content="...; charset=..."``,
        the latter only together with ``http-equiv="Content-Type"``.
        """
        valid_http_equiv = False
        encoding = None
        for name, value in attrs:
            # The parser reports attributes without a value as ``None``;
            # such attributes cannot declare an encoding.
            if value is None:
                continue
            name = name.lower()
            if name == 'charset':  # html5
                return value
            if name == 'http-equiv' and value.lower() == 'content-type':
                valid_http_equiv = True
            if name == 'content':
                encoding = self._get_encoding_from_content_attr(value)
        return encoding if valid_http_equiv else None

    def _get_encoding_from_content_attr(self, value):
        """Return the ``charset=`` part of a ``content`` value, or ``None``."""
        for token in value.split(';'):
            token = token.strip()
            if token.lower().startswith('charset='):
                return token[8:]
        return None

    def handle_pi(self, data):
        """Switch the input encoding if an XML declaration specifies one."""
        encoding = self._get_encoding_from_pi(data)
        if encoding:
            self._encoding = encoding

    def _get_encoding_from_pi(self, data):
        """Return the ``encoding`` of an ``xml`` processing instruction.

        Returns ``None`` for other processing instructions and when no
        ``encoding=`` token is present.
        """
        data = data.strip()
        if not data.lower().startswith('xml '):
            return None
        if data.endswith('?'):
            data = data[:-1]
        for token in data.split():
            if token.lower().startswith('encoding='):
                encoding = token[9:]
                # Strip the surrounding quotes from the value.
                if encoding.startswith("'") or encoding.startswith('"'):
                    encoding = encoding[1:-1]
                return encoding
        return None