translator.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. #!/usr/bin/python3
  2. # coding: utf-8
  3. from bs4 import BeautifulSoup
  4. from bs4.element import Tag
  5. import MySQLdb
  6. import re
  7. import sys
  8. import time
  9. import transliterate
  10. import openai
  11. blocks = ["p", "h1", "h2", "h3", "h4", "h5", "img", "li"]
  12. def href_rus_to_bg(db, cur, pattern_bg, text):
  13. pattern_url = r'https://adminkin.com/([^/]+)'
  14. for rus_postname in re.findall(pattern_url, text):
  15. sql = """
  16. SELECT wp_term_taxonomy.taxonomy, wp_term_taxonomy.description FROM wp_posts
  17. LEFT JOIN wp_term_relationships ON wp_posts.ID=wp_term_relationships.object_id
  18. LEFT JOIN wp_term_taxonomy ON wp_term_relationships.term_taxonomy_id=wp_term_taxonomy.term_taxonomy_id
  19. WHERE wp_posts.post_name LIKE '%s';
  20. """ % rus_postname
  21. cur.execute(sql)
  22. for row in cur.fetchall():
  23. if (row[0] == 'post_translations'):
  24. search_result = re.search(pattern_bg, row[1])
  25. if search_result:
  26. bg_postid = int(search_result.group(1))
  27. sql = "SELECT post_name FROM wp_posts WHERE ID = '%i'" % bg_postid
  28. cur.execute(sql)
  29. row2 = cur.fetchone()
  30. if row2 is not None:
  31. bg_postname = "bg/%s" % row2[0]
  32. text = re.sub(rus_postname, bg_postname, text)
  33. break
  34. return text
  35. def generate_response(prompt):
  36. openai.api_key = "OPENAI_API_KEY"
  37. prompt = "Требуется только перевод с русского языка на болгарский язык. Если в оригинале есть английские слова, то они должны присутствовать в переводе. Если в оригинале встречается html разметка, то вставлять её в соответствующих местах перевода. Никакие пояснения к переводу не требуются. Переведи: \"" + prompt + "\""
  38. response = openai.ChatCompletion.create(
  39. model="gpt-3.5-turbo",
  40. messages=[{"role": "system", "content": prompt}],
  41. temperature=0.3,
  42. n=1,
  43. stop=None,
  44. )
  45. if 'choices' in response and len(response.choices) > 0:
  46. text = response.choices[0].message.content
  47. if text.startswith('"'):
  48. text = text[1:]
  49. if text.endswith('"'):
  50. text = text[:-1]
  51. return text
  52. else:
  53. return None
  54. def remove_tag(string):
  55. pattern = r'^<[^>]+>|<[^>]+>$'
  56. result = re.sub(pattern, '', string)
  57. return result
  58. def insert_into_table_postmeta(post_id, meta_key, meta_value):
  59. db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
  60. cur = db.cursor()
  61. sql = """
  62. INSERT INTO wp_postmeta(meta_id, post_id, meta_key, meta_value)
  63. VALUES (NULL,'%i','%s','%s')
  64. """ % (post_id, meta_key, meta_value)
  65. cur.execute(sql)
  66. db.commit()
  67. db.close()
  68. def insert_into_table_postsTranslated(ID, post_modified):
  69. db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
  70. cur = db.cursor()
  71. sql = """
  72. INSERT INTO wp_posts_translated(ID, post_modified)
  73. VALUES ('%i','%s')
  74. """ % (ID, post_modified)
  75. cur.execute(sql)
  76. db.commit()
  77. db.close()
  78. def _extract_blocks(parent_tag) -> list:
  79. extracted_blocks = []
  80. for tag in parent_tag:
  81. if tag.name in blocks:
  82. if (tag.name == "img") and ("alt" in tag.attrs):
  83. if tag["alt"]:
  84. response = generate_response(tag["alt"])
  85. if response is not None:
  86. tag["alt"] = response
  87. time.sleep(25)
  88. else:
  89. sys.exit()
  90. else:
  91. response = generate_response(remove_tag(str(tag)))
  92. if response is not None:
  93. s1 = BeautifulSoup(response, 'html.parser')
  94. tag.clear()
  95. tag.append(s1)
  96. time.sleep(25)
  97. else:
  98. sys.exit()
  99. extracted_blocks.append(tag)
  100. continue
  101. if isinstance(tag, Tag):
  102. if len(tag.contents) > 0:
  103. inner_blocks = _extract_blocks(tag)
  104. if len(inner_blocks) > 0:
  105. extracted_blocks.extend(inner_blocks)
  106. return extracted_blocks
  107. if __name__ == "__main__":
  108. pattern_bg = r's:2:"bg";i:(\d+);'
  109. db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
  110. cur = db.cursor()
  111. sql = """
  112. SELECT * FROM wp_posts WHERE (post_status='publish') AND (post_type='post') AND (ID NOT IN (SELECT ID FROM wp_posts_translated)) ORDER BY ID ASC LIMIT 1;
  113. """
  114. cur.execute(sql)
  115. row = cur.fetchone()
  116. if row is not None:
  117. text = href_rus_to_bg(db, cur, pattern_bg, row[4])
  118. soup = BeautifulSoup(text, features="lxml")
  119. extracted_blocks = _extract_blocks(soup.body)
  120. post_content = re.sub(r'(<html><body>|</body></html>)', '', str(soup))
  121. post_title = ""
  122. response = generate_response(row[5])
  123. if response is not None:
  124. post_title = response
  125. time.sleep(25)
  126. else:
  127. sys.exit()
  128. post_name = transliterate.translit(post_title, 'ru', reversed=True)
  129. post_name = post_name.lower()
  130. post_name = re.sub(r'[^a-z0-9\- ]', '', post_name)
  131. post_name = re.sub(r' ', '-', post_name)
  132. guid = ""
  133. #####################
  134. # TABLE posts
  135. #####################
  136. db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
  137. cur = db.cursor()
  138. sql = """
  139. INSERT INTO wp_posts(ID, post_author, post_date, post_date_gmt, post_content, post_title, post_excerpt, post_status, comment_status, ping_status, post_password, post_name, to_ping, pinged, post_modified, post_modified_gmt, post_content_filtered, post_parent, guid, menu_order, post_type, post_mime_type, comment_count)
  140. VALUES (NULL,'%s',NOW(),UTC_TIMESTAMP(),'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',NOW(),UTC_TIMESTAMP(),'%s','%s','%s','%s','%s','%s',0)
  141. """ % (row[1], MySQLdb.escape_string(post_content).decode('utf-8'), MySQLdb.escape_string(post_title).decode('utf-8'), row[6], row[7], row[8], row[9], row[10], post_name, row[12], row[13], row[16], row[17], guid, row[19], row[20], row[21])
  142. cur.execute(sql)
  143. db.commit()
  144. sql = "SELECT LAST_INSERT_ID();"
  145. cur.execute(sql)
  146. tb_posts_id = cur.fetchone()
  147. if tb_posts_id is not None:
  148. guid = "https://adminkin.com/?p=%i" % tb_posts_id[0]
  149. sql = """
  150. UPDATE wp_posts SET guid='%s' WHERE ID='%i'
  151. """ % (guid, tb_posts_id[0])
  152. cur.execute(sql)
  153. db.commit()
  154. ###############################
  155. # TABLE term_taxonomy
  156. ###############################
  157. sql = """
  158. INSERT INTO wp_term_taxonomy(term_taxonomy_id, term_id, taxonomy, description, parent, count)
  159. VALUES (NULL,0,'post_translations','a:2:{s:2:"bg";i:%i;s:2:"ru";i:%i;}',0,2)
  160. """ % (tb_posts_id[0], row[0])
  161. cur.execute(sql)
  162. db.commit()
  163. sql = "SELECT LAST_INSERT_ID();"
  164. cur.execute(sql)
  165. tb_term_taxonomy_id = cur.fetchone()
  166. if tb_term_taxonomy_id is not None:
  167. sql = """
  168. UPDATE wp_term_taxonomy SET term_id='%i' WHERE term_taxonomy_id='%i'
  169. """ % (tb_term_taxonomy_id[0], tb_term_taxonomy_id[0])
  170. cur.execute(sql)
  171. db.commit()
  172. sql = """
  173. INSERT INTO wp_terms(term_id, name, slug, term_group)
  174. VALUES ('%i','pll_6111111111111','pll_6111111111111',0)
  175. """ % (tb_term_taxonomy_id[0])
  176. cur.execute(sql)
  177. db.commit()
  178. ################################
  179. # TABLE term_relationships
  180. ################################
  181. category_id_bg = 107
  182. sql = """
  183. SELECT wp_term_relationships.*, wp_term_taxonomy.taxonomy FROM wp_term_relationships LEFT JOIN wp_term_taxonomy
  184. ON wp_term_relationships.term_taxonomy_id=wp_term_taxonomy.term_taxonomy_id WHERE wp_term_relationships.object_id='%i'
  185. """ % row[0]
  186. cur.execute(sql)
  187. for tb_term_relationships in cur.fetchall():
  188. if (tb_term_relationships[3] == 'category') or (tb_term_relationships[3] == 'post_tag'):
  189. pattern_ru = r's:2:"ru";i:%i;' % tb_term_relationships[1]
  190. sql = """
  191. SELECT * FROM wp_term_taxonomy WHERE taxonomy LIKE 'term_translations'
  192. """
  193. cur.execute(sql)
  194. for res in cur.fetchall():
  195. search_result = re.search(pattern_ru, res[3])
  196. if search_result:
  197. search_result = re.search(pattern_bg, res[3])
  198. if search_result:
  199. id_bg = int(search_result.group(1))
  200. sql = """
  201. INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
  202. VALUES ('%i','%s','%s')
  203. """ % (tb_posts_id[0], id_bg, tb_term_relationships[2])
  204. cur.execute(sql)
  205. db.commit()
  206. sql = """
  207. UPDATE wp_term_taxonomy SET count=count+1 WHERE term_taxonomy_id='%s'
  208. """ % (id_bg)
  209. cur.execute(sql)
  210. db.commit()
  211. if (tb_term_relationships[3] == 'category'):
  212. category_id_bg = id_bg
  213. break
  214. elif (tb_term_relationships[3] == 'language'):
  215. language_id_bg = 105
  216. sql = """
  217. INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
  218. VALUES ('%i','%i','%s')
  219. """ % (tb_posts_id[0], language_id_bg, tb_term_relationships[2])
  220. cur.execute(sql)
  221. db.commit()
  222. sql = """
  223. UPDATE wp_term_taxonomy SET count=count+1 WHERE term_taxonomy_id='%i'
  224. """ % (language_id_bg)
  225. cur.execute(sql)
  226. db.commit()
  227. sql = """
  228. INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
  229. VALUES ('%i','%i',0)
  230. """ % (tb_posts_id[0], tb_term_taxonomy_id[0])
  231. cur.execute(sql)
  232. db.commit()
  233. sql = """
  234. INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
  235. VALUES ('%i','%i',0)
  236. """ % (row[0], tb_term_taxonomy_id[0])
  237. cur.execute(sql)
  238. db.commit()
  239. ########################
  240. # TABLE postmeta
  241. ########################
  242. sql = """
  243. SELECT * FROM wp_postmeta WHERE post_id = '%i'
  244. """ % row[0]
  245. cur.execute(sql)
  246. for tb_postmeta in cur.fetchall():
  247. meta_value = ""
  248. if (tb_postmeta[2] == '_yoast_wpseo_focuskw') or ((tb_postmeta[2] == '_yoast_wpseo_title') and (tb_postmeta[3] != '%%title%%')):
  249. response = generate_response(tb_postmeta[3])
  250. if response is not None:
  251. meta_value = response
  252. time.sleep(25)
  253. meta_value = MySQLdb.escape_string(meta_value).decode('utf-8')
  254. else:
  255. time.sleep(25)
  256. continue
  257. elif (tb_postmeta[2] == '_original_post'):
  258. meta_value = "https://adminkin.com/?p=%i" % tb_posts_id[0]
  259. elif (tb_postmeta[2] == '_yoast_wpseo_primary_category'):
  260. meta_value = "%i" % category_id_bg
  261. else:
  262. meta_value = MySQLdb.escape_string(tb_postmeta[3]).decode('utf-8')
  263. insert_into_table_postmeta(tb_posts_id[0], tb_postmeta[2], meta_value)
  264. ##############################
  265. # TABLE posts_translated
  266. ##############################
  267. insert_into_table_postsTranslated(row[0], row[14])
  268. insert_into_table_postsTranslated(tb_posts_id[0], row[14])
  269. db.close()