Commit 091bce2e authored by Jakob Lerch
Browse files

add main algorithm

parent d86fe44d
[main]
active_set_size = 2
header_size =10
header_size = 10
[translation]
service = libretranslate
......
import json
import time
from configparser import ConfigParser
from pathlib import Path
from difflib import Differ
from difflib import SequenceMatcher
from itertools import takewhile
from math import ceil
from pathlib import Path
import atexit
import requests
......@@ -14,9 +16,10 @@ from translation import DeepL, LibreTranslate, Translation
# TODO: seems to work only with the fem instance; it fails when using libretranslate.de
# TODO: implement possibility to execute this package for multiple pairs of protocols (see matterbridge)
# TODO: work on appending text:
# idea: 2 states:
# * detection of active line (using diff)
# * translation of active line
# write termination message on exit
atexit.register(lambda: dst_pad.write(
"[PROTOCOL TRANSLATION HAS BEEN TERMINATED]\n" + dst_pad.read()))
if __name__ == "__main__":
# read config files
......@@ -28,7 +31,6 @@ if __name__ == "__main__":
config.read(Path(__file__).parent.parent / "config.conf")
# read main
active_set_size = int(config["main"]["active_set_size"])
header_size = int(config["main"]["header_size"])
# create src pad
......@@ -71,23 +73,88 @@ if __name__ == "__main__":
raise ValueError(
"config.conf: 'service' in [translation] is not valid")
# do translation
# ..initial translation
# initial translation
dst_pad.write("initializing...")
initial_text = src_pad.read().split('\n')
header = '\n'.join(initial_text[0:header_size])
body = '\n'.join(initial_text[header_size:])
header = initial_text[0:header_size]
body = initial_text[header_size:]
# ..only include line break after header if header size > 0
joined_header = '\n'.join(header) + '\n' if len(header) > 0 else ""
translated_body = t.translate(body, src_lang, dst_lang)
dst_pad.write(header + '\n' + translated_body)
# ..strip leading lines that are empty or consist only of whitespace
# ..and add them back when writing to dst_pad
# ..this has to be done because libretranslate ignores leading whitespace,
# ..which leads to an incorrect order of the translated lines
leading_space_lines = list(
takewhile(lambda line: line.isspace() or not line, body))
# ..only include line break after lines if size > 0
joined_leading_space_lines = '\n'.join(
leading_space_lines) + '\n' if len(leading_space_lines) > 0 else ""
body = body[len(leading_space_lines):]
joined_translated_body = t.translate(
'\n'.join(body), src_lang, dst_lang)
dst_pad.write(joined_header + joined_leading_space_lines +
joined_translated_body)
# ..further translation
d = Differ()
s = SequenceMatcher()
translated_body = joined_translated_body.split('\n')
while True:
# TODO: this assumes that libretranslate translates the protocol line by line;
# this still has to be tested
# old_body, body and translated_body are lists of lines
old_body = body
body = src_pad.read()[header_size:]
# TODO: use diff to find changes of protocol, extract them, translate them, insert them, post them
# read body (without header)
body = src_pad.read().split('\n')[header_size:]
# again, handle leading space lines
leading_space_lines = list(
takewhile(lambda line: line.isspace() or not line, body))
joined_leading_space_lines = '\n'.join(
leading_space_lines) + '\n' if len(leading_space_lines) > 0 else ""
body = body[len(leading_space_lines):]
# let the sequence matcher compute the opcodes (edit operations) that turn old_body into body
s.set_seqs(old_body, body)
opcodes = s.get_opcodes()
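# the generator expression below does a couple of things:
# ..it skips all elements of 'opcodes' that carry the 'equal' tag
# ..for every remaining opcode it translates the changed lines body[j1:j2]
# ..and appends the translated lines as a sixth tuple element
# ..except for 'delete' opcodes, which add no text, so None is appended instead
# ..being a generator, it performs each translation only when the loop below reaches it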
translated_opcodes = (
(e[0], e[1], e[2], e[3], e[4], (lambda: t.translate('\n'.join(body[e[3]:e[4]]), src_lang, dst_lang).split('\n') if e[0] != "delete" else None)()) for e in opcodes if e[0] != "equal")
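# apply the translated changes to translated_body in place;
# added_lines is the offset between indices in old_body and the current
# indices in translated_body caused by earlier deletions and insertions
# NOTE(review): the 'replace' branch does not update added_lines, so it relies on
# the translated replacement having as many lines as the range it replaces - confirm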
added_lines = 0
for e in translated_opcodes:
if e[0] == "delete":
del translated_body[e[1] +
added_lines: e[2] + added_lines]
added_lines -= e[2] - e[1]
elif e[0] == "replace":
translated_body[e[1] +
added_lines: e[2] + added_lines] = e[5]
elif e[0] == "insert":
translated_body[e[1] +
added_lines: e[2] + added_lines] = e[5]
added_lines += e[4] - e[3]
else:
assert False
# dst_pad.write(src_pad.read())
pass
# write
dst_pad.write(joined_header + joined_leading_space_lines +
'\n'.join(translated_body))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment