from googletrans import Translator
from multiprocessing import Pool
from bs4 import BeautifulSoup
from time import sleep
from os import listdir, walk
# Root folder that run() walks to find the .htm files to translate.
DIRECTORY = 'C:\\Users\\GENERAL\\Desktop\\quests\\'
# TODO: better documentation; skip a file when it already exists in the
# output folder.
# Input and output roots are the same folder, so each translated file is
# written over the file it was read from.
html_folder_path = output_folder_path = DIRECTORY
def run():
    """Translate every ``.htm`` file below DIRECTORY with five worker processes.

    The directory tree is walked once to collect the file paths, then each
    path is handed to ``trans`` through a process pool.
    """
    html_paths = []
    for root, _dirs, files in walk(DIRECTORY):
        for name in files:
            if name.endswith('.htm'):
                html_paths.append(root + '\\' + name)
    # map() blocks until every file has been processed; leaving the ``with``
    # block then shuts the worker processes down instead of leaking them.
    with Pool(5) as p:
        p.map(trans, html_paths)
# Characters deleted from every text node before translation: horizontal
# ellipsis (U+2026), no-break space (U+00A0) and next-line (U+0085).
_STRIP_TABLE = str.maketrans('', '', '\u2026\xa0\x85')


def _read_html(html_path):
    """Return the text of *html_path*, decoded as UTF-8 or, failing that, Latin-1."""
    try:
        with open(html_path, encoding="utf-8") as source:
            return source.read()
    except UnicodeDecodeError:
        # Latin-1 assigns a character to every byte value, so this second
        # attempt cannot fail on decoding.
        with open(html_path, encoding="latin1") as source:
            return source.read()


def trans(html_path):
    """Translate the text of one HTML file to Russian and write the result.

    The output path is ``output_folder_path`` plus the part of *html_path*
    that follows ``html_folder_path``. Text nodes that are empty once the
    characters in ``_STRIP_TABLE`` are removed are left untouched. A document
    without a ``<body>`` is skipped and nothing is written for it.

    Args:
        html_path: Path of the HTML file to translate; expected to start
            with ``html_folder_path``.
    """
    # Sleep to prevent google ip ban
    sleep(1)
    translator = Translator()
    output_path = output_folder_path + html_path[len(html_folder_path):]

    soup = BeautifulSoup(_read_html(html_path), 'lxml')
    if soup.body is None:
        # Nothing to translate (e.g. an empty file).
        return

    # Keep every translatable node next to its cleaned text so that each
    # translation is written back to the node it came from, even when some
    # nodes are skipped.
    nodes = []
    texts = []
    for element in soup.body.findAll(text=True):
        text = str(element).translate(_STRIP_TABLE)
        if text:
            nodes.append(element)
            texts.append(text)

    if texts:
        translated = translator.translate(texts, dest="ru")
        # Replace text in html file
        for element, result in zip(nodes, translated):
            element.replace_with(result.text)

    with open(output_path, "wb") as file:
        file.write(soup.prettify("utf-8"))

    # Post-process the file that was just written: turn self-closed <br/>
    # tags back into plain <br>.
    with open(output_path, 'r', encoding='utf-8') as f:
        txt = f.read().replace('<br/>', '<br>')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(txt)
# Script entry point: run the batch translation only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    run()
    print("Complete")