542 lines
17 KiB
Python
542 lines
17 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright © 2012-2013, 2015-2016 marmuta <marmvta@gmail.com>
|
|
#
|
|
# This file is part of Onboard.
|
|
#
|
|
# Onboard is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# Onboard is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
from __future__ import division, print_function, unicode_literals
|
|
|
|
import time
|
|
|
|
### Logging ###
|
|
import logging
|
|
_logger = logging.getLogger("TextChanges")
|
|
###############
|
|
|
|
|
|
class TextSpan:
|
|
"""
|
|
Span of text
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(3, 2, "0123456789")
|
|
>>> span.get_span_text()
|
|
'34'
|
|
"""
|
|
|
|
def __init__(self, pos = 0, length = 0, text = "", text_pos = 0):
|
|
self.pos = pos # document caret position
|
|
self.length = length # span length
|
|
self.text = text # text that includes span, but may be larger
|
|
self.text_pos = text_pos # document position of text begin
|
|
self.last_modified = None
|
|
|
|
def copy(self):
|
|
return TextSpan(self.pos, self.length, self.text, self.text_pos)
|
|
|
|
def begin(self):
|
|
return self.pos
|
|
|
|
def end(self):
|
|
return self.pos + self.length
|
|
|
|
def text_begin(self):
|
|
return self.text_pos
|
|
|
|
def is_empty(self):
|
|
return self.length == 0
|
|
|
|
def contains(self, pos):
|
|
return self.pos <= pos < self.pos + self.length
|
|
|
|
def intersects(self, span):
|
|
return not self.intersection(span).is_empty()
|
|
|
|
def intersection(self, span):
|
|
p0 = max(self.pos, span.pos)
|
|
p1 = min(self.pos + self.length, span.pos + span.length)
|
|
if p0 > p1:
|
|
return TextSpan()
|
|
else:
|
|
return TextSpan(p0, p1 - p0)
|
|
|
|
def union_inplace(self, span):
|
|
"""
|
|
Join two spans, result in self.
|
|
|
|
Doctests:
|
|
- adjacent spans
|
|
>>> a = TextSpan(2, 3, "0123456789")
|
|
>>> b = TextSpan(5, 2, "0123456789")
|
|
>>> a.union_inplace(b) # doctest: +ELLIPSIS
|
|
TextSpan(2, 5, '23456', ...
|
|
>>> a.get_text()
|
|
'0123456789'
|
|
|
|
- intersecting spans
|
|
>>> a = TextSpan(2, 3, "0123456789")
|
|
>>> b = TextSpan(4, 2, "0123456789")
|
|
>>> a.union_inplace(b) # doctest: +ELLIPSIS
|
|
TextSpan(2, 4, '2345', ...
|
|
>>> a.get_text()
|
|
'0123456789'
|
|
"""
|
|
begin = min(self.begin(), span.begin())
|
|
end = max(self.end(), span.end())
|
|
length = end - begin
|
|
middle = length // 2
|
|
self.text = self.text[:middle - self.text_pos] + \
|
|
span.text[middle - span.text_pos:]
|
|
self.pos = begin
|
|
self.length = length
|
|
self.last_modified = max(self.last_modified if self.last_modified else 0,
|
|
span.last_modified if span.last_modified else 0)
|
|
return self
|
|
|
|
def get_text(self, begin = None, end = None):
|
|
""" Return the whole available text """
|
|
if begin is None and end is None:
|
|
return self.text
|
|
|
|
if begin is None:
|
|
begin = self.pos
|
|
if end is None:
|
|
end = self.end()
|
|
|
|
return self.text[begin - self.text_pos : end - self.text_pos]
|
|
|
|
def get_span_text(self):
|
|
""" Return just the span's part of the text. """
|
|
return self.get_text(self.pos, self.end())
|
|
|
|
def get_text_until_span(self):
|
|
"""
|
|
Return the beginning of the whole available text,
|
|
ending with and including the span.
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(3, 2, "0123456789")
|
|
>>> span.get_text_until_span()
|
|
'01234'
|
|
"""
|
|
return self.text[:self.end() - self.text_pos]
|
|
|
|
def get_text_from_span(self):
|
|
"""
|
|
Return the end of the whole available text,
|
|
starting from and including the span.
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(3, 2, "0123456789")
|
|
>>> span.get_text_from_span()
|
|
'3456789'
|
|
"""
|
|
return self.text[self.pos - self.text_pos:]
|
|
|
|
def get_text_after_span(self):
|
|
"""
|
|
Return the remaining available text after the span.
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(3, 2, "0123456789")
|
|
>>> span.get_text_after_span()
|
|
'56789'
|
|
"""
|
|
return self.text[self.end() - self.text_pos:]
|
|
|
|
def get_char_before_span(self):
|
|
"""
|
|
Character right before the span.
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(0, 0, "0123456789", 0)
|
|
>>> span.get_char_before_span()
|
|
''
|
|
|
|
>>> span = TextSpan(9, 1, "0123456789", 0)
|
|
>>> span.get_char_before_span()
|
|
'8'
|
|
|
|
>>> span = TextSpan(5, 2, "3456789", 3)
|
|
>>> span.get_char_before_span()
|
|
'4'
|
|
"""
|
|
pos = self.pos - self.text_pos
|
|
return self.text[pos - 1 : pos]
|
|
|
|
def get_last_char_in_span(self):
|
|
"""
|
|
Character right before the span.
|
|
|
|
Doctests:
|
|
>>> span = TextSpan(0, 0, "0123456789", 0)
|
|
>>> span.get_last_char_in_span()
|
|
''
|
|
|
|
>>> span = TextSpan(9, 1, "0123456789", 0)
|
|
>>> span.get_last_char_in_span()
|
|
'9'
|
|
|
|
>>> span = TextSpan(5, 2, "3456789", 3)
|
|
>>> span.get_last_char_in_span()
|
|
'6'
|
|
"""
|
|
pos = self.end() - self.text_pos
|
|
return self.text[pos - 1 : pos]
|
|
|
|
def _escape(self, text):
|
|
return text.replace("\n", "\\n")
|
|
|
|
def __repr__(self):
|
|
return "TextSpan({}, {}, '{}', {}, {})" \
|
|
.format(self.pos, self.length,
|
|
self._escape(self.get_span_text()),
|
|
self.text_begin(),
|
|
self.last_modified)
|
|
|
|
|
|
class TextChanges:
|
|
__doc__ = """
|
|
Collection of text spans yet to be learned.
|
|
|
|
Example:
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 1) # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 1]]
|
|
|
|
Doctests:
|
|
# insert and extend span
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 1) # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 1]]
|
|
>>> c.insert(0, 1) # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 2]]
|
|
|
|
# extend at beginning and end
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 1); c.insert(1, 1); c.insert(0, 3) # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 5]]
|
|
|
|
# insert separated by at least one character -> multiple spans
|
|
>>> c = TextChanges()
|
|
>>> c.insert(1, 1); c.insert(0, 1) # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 1], [2, 1]]
|
|
|
|
# add and delete inside single span
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 9); # IGNORE_RESULT
|
|
>>> c.delete(2, 1); # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 8]]
|
|
|
|
# join spans when deleting
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 1); c.insert(2, 1) # IGNORE_RESULT
|
|
>>> c.delete(2, 1); # IGNORE_RESULT
|
|
>>> c.delete(1, 1); # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 1]]
|
|
|
|
# remove spans fully contained in the deleted range
|
|
>>> c = TextChanges()
|
|
>>> c.insert(2, 1); c.insert(4, 1) # IGNORE_RESULT
|
|
>>> c.delete(0, 5); # IGNORE_RESULT
|
|
>>> c.get_span_ranges()
|
|
[[0, 0]]
|
|
|
|
# partially delete span, with and without recording empty spans
|
|
# ins del res with res without
|
|
>>> tests = [ # deletion before span
|
|
... [[2, 3], [0, 5], [[0, 0]], [[0, 0]] ],
|
|
... [[3, 3], [0, 5], [[0, 1]], [[0, 1]] ],
|
|
... [[4, 3], [0, 5], [[0, 2]], [[0, 2]] ],
|
|
... [[5, 3], [0, 5], [[0, 3]], [[0, 3]] ],
|
|
... [[6, 3], [0, 5], [[0, 0], [1, 3]], [[1, 3]] ],
|
|
... # deletion after span
|
|
... [[0, 3], [4, 5], [[0, 3], [4, 0]], [[0, 3]] ],
|
|
... [[1, 3], [4, 5], [[1, 3]], [[1, 3]] ],
|
|
... [[2, 3], [4, 5], [[2, 2]], [[2, 2]] ],
|
|
... [[3, 3], [4, 5], [[3, 1]], [[3, 1]] ],
|
|
... # deletion completely inside of span
|
|
... [[4, 3], [4, 5], [[4, 0]], [[4, 0]] ],
|
|
... [[0, 9], [2, 3], [[0, 6]], [[0, 6]] ] ]
|
|
>>> for test in tests:
|
|
... c = TextChanges()
|
|
... _ = c.insert(*test[0]); _ = c.delete(test[1][0], test[1][1], True)
|
|
... if c.get_span_ranges() != test[2]:
|
|
... "test1: " + repr(test) + " result: " + repr(c.get_span_ranges())
|
|
... c = TextChanges()
|
|
... _ = c.insert(*test[0]); _ = c.delete(test[1][0], test[1][1], False)
|
|
... if c.get_span_ranges() != test[3]:
|
|
... "test2: " + repr(test) + " result: " + repr(c.get_span_ranges())
|
|
|
|
# insert excluded span, include_length=0 to always insert an empty span
|
|
# ins del result
|
|
>>> tests = [[[5, 5], [2, 3], [[2, 0], [8, 5]] ], # insert before span
|
|
... [[0, 5], [6, 3], [[0, 5], [6, 0]] ], # insert after span
|
|
... [[0, 5], [2, 3], [[0, 2], [5, 3]] ], # insert inside span
|
|
... [[0, 5], [3, 4], [[0, 3], [7, 2]] ] ] # insert at span end
|
|
>>> for test in tests:
|
|
... c = TextChanges()
|
|
... _= c.insert(*test[0]); _ = c.insert(test[1][0], test[1][1], 0)
|
|
... if c.get_span_ranges() != test[2]:
|
|
... "test: " + repr(test) + " result: " + repr(c.get_span_ranges())
|
|
|
|
""".replace('IGNORE_RESULT', 'doctest: +ELLIPSIS\n [...')
|
|
|
|
def __init__(self, spans = None):
|
|
self.clear()
|
|
if spans:
|
|
self._spans = spans
|
|
|
|
def clear(self):
|
|
self._spans = []
|
|
|
|
# some counts for book-keeping, not used by this class itself.
|
|
self.insert_count = 0
|
|
self.delete_count = 0
|
|
|
|
def is_empty(self):
|
|
return len(self._spans) == 0
|
|
|
|
def get_spans(self):
|
|
return self._spans
|
|
|
|
def remove_span(self, span):
|
|
self._spans.remove(span)
|
|
|
|
def get_change_count(self):
|
|
return self.insert_count + self.delete_count
|
|
|
|
def insert(self, pos, length, include_length = -1):
|
|
"""
|
|
Record insertion up to <include_length> characters,
|
|
counted from the start of the insertion. The remaining
|
|
inserted characters are excluded from spans. This may split
|
|
an existing span.
|
|
|
|
A small but non-zero <include_length> allows to skip over
|
|
possible whitespace at the start of the insertion and
|
|
will often result in including the very first word(s) for learning.
|
|
|
|
include_length = -1: include length
|
|
include_length = +n: include n
|
|
include_length = None: include nothing, don't record
|
|
zero length span either
|
|
"""
|
|
end = pos + length
|
|
spans_to_update = []
|
|
|
|
# shift all existing spans after position
|
|
for span in self._spans:
|
|
if span.pos > pos:
|
|
span.pos += length
|
|
spans_to_update.append(span)
|
|
|
|
if include_length == -1:
|
|
# include all of the insertion
|
|
span = self.find_span_at(pos)
|
|
if span:
|
|
span.length += length
|
|
else:
|
|
span = TextSpan(pos, length);
|
|
self._spans.append(span)
|
|
spans_to_update.append(span)
|
|
else:
|
|
# include the insertion up to include_length only
|
|
max_include = min(length, include_length or 0)
|
|
span = self.find_span_at(pos)
|
|
if span:
|
|
# cut existing span
|
|
old_length = span.length
|
|
span.length = pos - span.pos + max_include
|
|
spans_to_update.append(span)
|
|
|
|
# new span for the cut part
|
|
l = old_length - span.length
|
|
if l > 0 or \
|
|
l == 0 and include_length is None:
|
|
span2 = TextSpan(pos + length, l)
|
|
self._spans.append(span2)
|
|
spans_to_update.append(span2)
|
|
|
|
elif not include_length is None:
|
|
span = TextSpan(pos, max_include)
|
|
self._spans.append(span)
|
|
spans_to_update.append(span)
|
|
|
|
t = time.time()
|
|
for span in spans_to_update:
|
|
span.last_modified = t
|
|
|
|
if spans_to_update:
|
|
self.insert_count += 1
|
|
|
|
return spans_to_update
|
|
|
|
def delete(self, pos, length, record_empty_spans = True):
|
|
"""
|
|
Record deletion.
|
|
|
|
record_empty_spans = True: record extra zero length spans
|
|
at deletion point
|
|
record_empty_spans = False: no extra new spans, but keep existing ones
|
|
that become zero length (terminal scrolling)
|
|
"""
|
|
begin = pos
|
|
end = pos + length
|
|
spans_to_update = []
|
|
|
|
#from pudb import set_trace; set_trace()
|
|
|
|
# cut/remove existing spans
|
|
for span in list(self._spans):
|
|
if span.pos <= pos: # span begins before deletion point?
|
|
k = min(span.end() - begin, length) # intersecting length
|
|
if k >= 0:
|
|
span.length -= k
|
|
spans_to_update.append(span)
|
|
else: # span begins after deletion point
|
|
k = end - span.begin() # intersecting length
|
|
if k >= 0:
|
|
span.pos += k
|
|
span.length -= k
|
|
span.pos -= length # shift by deleted length
|
|
|
|
# remove spans fully contained in the deleted range
|
|
if span.length < 0:
|
|
self._spans.remove(span)
|
|
else:
|
|
spans_to_update.append(span)
|
|
|
|
# Add new empty span
|
|
if record_empty_spans:
|
|
span = self.find_span_excluding(pos)
|
|
if not span:
|
|
# Create empty span when deleting too, because this
|
|
# is still a change that can result in a word to learn.
|
|
span = TextSpan(pos, 0);
|
|
self._spans.append(span)
|
|
|
|
self._spans, span = self.consolidate_spans(self._spans, span)
|
|
spans_to_update.append(span)
|
|
|
|
if spans_to_update:
|
|
self.delete_count += 1
|
|
|
|
return spans_to_update
|
|
|
|
@staticmethod
|
|
def consolidate_spans(spans, tracked_span = None):
|
|
"""
|
|
join touching or intersecting text spans
|
|
|
|
Doctests:
|
|
# Join touching spans
|
|
>>> spans = [TextSpan(0, 1),
|
|
... TextSpan(2, 4),
|
|
... TextSpan(1, 1),
|
|
... TextSpan(10, 3),
|
|
... TextSpan(8, 2)]
|
|
>>> spans, _span = TextChanges.consolidate_spans(spans)
|
|
>>> TextChanges.to_span_ranges(spans)
|
|
[[0, 6], [8, 5]]
|
|
|
|
# Join overlapping spans
|
|
>>> spans = [TextSpan(2, 5),
|
|
... TextSpan(4, 10),
|
|
... TextSpan(12, 8)]
|
|
>>> spans, _span = TextChanges.consolidate_spans(spans)
|
|
>>> TextChanges.to_span_ranges(spans)
|
|
[[2, 18]]
|
|
|
|
# Join contained spans
|
|
>>> spans = [TextSpan(5, 1),
|
|
... TextSpan(2, 10),
|
|
... TextSpan(3, 4)]
|
|
>>> spans, _span = TextChanges.consolidate_spans(spans)
|
|
>>> TextChanges.to_span_ranges(spans)
|
|
[[2, 10]]
|
|
"""
|
|
spans = sorted(spans, key=lambda x: (x.begin(), x.end()))
|
|
new_spans = []
|
|
slast = None
|
|
for s in spans:
|
|
if slast and \
|
|
slast.end() >= s.begin():
|
|
slast.union_inplace(s)
|
|
if tracked_span is s:
|
|
tracked_span = slast
|
|
else:
|
|
new_spans.append(s)
|
|
slast = s
|
|
|
|
return new_spans, tracked_span
|
|
|
|
def find_span_at(self, pos):
|
|
"""
|
|
Doctests:
|
|
- find empty spans (text deleted):
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 0) # doctest: +ELLIPSIS
|
|
[TextSpan(...
|
|
>>> c.find_span_at(0) # doctest: +ELLIPSIS
|
|
TextSpan(0, 0,...
|
|
"""
|
|
for span in self._spans:
|
|
if span.pos <= pos <= span.pos + span.length:
|
|
return span
|
|
return None
|
|
|
|
def find_span_excluding(self, pos):
|
|
"""
|
|
Doctests:
|
|
- find empty spans (text deleted):
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 0) # doctest: +ELLIPSIS
|
|
[TextSpan(...
|
|
>>> c.find_span_excluding(0) # doctest: +ELLIPSIS
|
|
TextSpan(0, 0,...
|
|
|
|
- don't match the end
|
|
>>> c = TextChanges()
|
|
>>> c.insert(0, 1) # doctest: +ELLIPSIS
|
|
[TextSpan(...
|
|
>>> c.find_span_excluding(1) # doctest: +ELLIPSIS
|
|
|
|
"""
|
|
for span in self._spans:
|
|
if span.pos == pos or \
|
|
span.pos <= pos < span.pos + span.length:
|
|
return span
|
|
return None
|
|
|
|
def get_span_ranges(self):
|
|
return self.to_span_ranges(self._spans)
|
|
|
|
@staticmethod
|
|
def to_span_ranges(spans):
|
|
return sorted([[span.pos, span.length] for span in spans])
|
|
|
|
def __repr__(self):
|
|
return "TextChanges " + repr([str(span) for span in self._spans])
|
|
|