Parcourir la source

first commit. first working version

gogsadmin il y a 1 an
commit
bab81fc537
1 fichier modifié avec 261 ajouts et 0 suppression
  1. 261 0
      translator.py

+ 261 - 0
translator.py

@@ -0,0 +1,261 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+import MySQLdb
+import re
+import sys
+import time
+import transliterate
+import openai
+
+blocks = ["p", "h1", "h2", "h3", "h4", "h5", "img", "li"]  # tag names that _extract_blocks sends for translation as one unit
+
+def generate_response(prompt):
+    openai.api_key = "OPENAI_API_KEY"
+    prompt = "Интересует только перевод с русского языка на болгарский язык. Если в оригинале встречается html разметка, то вставлять ее в соответствующих местах перевода. Если перевод совпадает с оригиналом, то вывести только перевод, пояснения о совпадении оригинала и перевода не требуются. Переведи: \"" + prompt + "\""
+
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "system", "content": prompt}],
+        temperature=0.3,
+        n=1,
+        stop=None,
+    )
+
+    if 'choices' in response and len(response.choices) > 0:
+        return response.choices[0].message.content
+    else:
+        return None
+
+def remove_tag(string):
+    pattern = r'^<[^>]+>|<[^>]+>$'
+    result = re.sub(pattern, '', string)
+    return result
+
+def insert_into_table_postmeta(post_id, meta_key, meta_value):
+    db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
+    cur = db.cursor()
+    sql = """
+    INSERT INTO wp_postmeta(meta_id, post_id, meta_key, meta_value) 
+    VALUES (NULL,'%i','%s','%s')
+    """ % (post_id, meta_key, meta_value)
+    cur.execute(sql)
+    db.commit()
+    db.close()
+
+def insert_into_table_postsTranslated(ID, post_modified):
+    db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
+    cur = db.cursor()
+    sql = """
+    INSERT INTO wp_posts_translated(ID, post_modified) 
+    VALUES ('%i','%s')
+    """ % (ID, post_modified)
+    cur.execute(sql)
+    db.commit()
+    db.close()
+
+def _extract_blocks(parent_tag) -> list:
+    extracted_blocks = []
+    for tag in parent_tag:
+        if tag.name in blocks:
+            if (tag.name == "img") and ("alt" in tag.attrs):
+                if tag["alt"]:
+                    response = generate_response(tag["alt"])
+                    if response is not None:
+                        tag["alt"] = response
+                        time.sleep(25)
+                    else:
+                        sys.exit()
+            else:
+                response = generate_response(remove_tag(str(tag)))
+                if response is not None:
+                    s1 = BeautifulSoup(response, 'html.parser')
+                    tag.clear()
+                    tag.append(s1)
+                    time.sleep(25)
+                else:
+                    sys.exit()
+                extracted_blocks.append(tag)
+            continue
+        if isinstance(tag, Tag):
+            if len(tag.contents) > 0:
+                inner_blocks = _extract_blocks(tag)
+                if len(inner_blocks) > 0:
+                    extracted_blocks.extend(inner_blocks)
+    return extracted_blocks
+
+if __name__ == "__main__":
+    pattern_bg = r's:2:"bg";i:(\d+);'
+
+    db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
+    cur = db.cursor()
+
+    sql = """
+    SELECT * FROM wp_posts WHERE (post_status='publish') AND (post_type='post') AND (ID NOT IN (SELECT ID FROM wp_posts_translated)) ORDER BY ID ASC LIMIT 1;
+    """
+    cur.execute(sql)
+    row = cur.fetchone()
+    if row is not None:
+        soup = BeautifulSoup(row[4], features="lxml")
+        extracted_blocks = _extract_blocks(soup.body)
+        post_content = re.sub(r'(<html><body>|</body></html>)', '', str(soup))
+
+        post_title = ""
+        response = generate_response(row[5])
+        if response is not None:
+            post_title = response
+            time.sleep(25)
+        else:
+            sys.exit()
+
+        post_name = transliterate.translit(post_title, 'ru', reversed=True)
+        post_name = post_name.lower()
+        post_name = re.sub(r'[^a-z,0-9,-, ]', '', post_name)
+        post_name = re.sub(r' ', '-', post_name)
+
+        guid = ""
+
+        #####################
+        # TABLE posts
+        #####################
+        db = MySQLdb.connect(host="DB_SERVER", user="DB_USER", passwd="DB_PASSWORD", db="DB_NAME", charset = 'DB_CHARSET')
+        cur = db.cursor()
+        sql = """
+        INSERT INTO wp_posts(ID, post_author, post_date, post_date_gmt, post_content, post_title, post_excerpt, post_status, comment_status, ping_status, post_password, post_name, to_ping, pinged, post_modified, post_modified_gmt, post_content_filtered, post_parent, guid, menu_order, post_type, post_mime_type, comment_count)
+        VALUES (NULL,'%s',NOW(),UTC_TIMESTAMP(),'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',NOW(),UTC_TIMESTAMP(),'%s','%s','%s','%s','%s','%s',0)
+        """  % (row[1], MySQLdb.escape_string(post_content).decode('utf-8'), MySQLdb.escape_string(post_title).decode('utf-8'), row[6], row[7], row[8], row[9], row[10], post_name, row[12], row[13], row[16], row[17], guid, row[19], row[20], row[21])
+        cur.execute(sql)
+        db.commit()
+        sql = "SELECT LAST_INSERT_ID();"
+        cur.execute(sql)
+        tb_posts_id = cur.fetchone()
+        if tb_posts_id is not None:
+            guid = "https://adminkin.com/?p=%i" % tb_posts_id[0]
+            sql = """
+            UPDATE wp_posts SET guid='%s' WHERE ID='%i'
+            """ % (guid, tb_posts_id[0])
+            cur.execute(sql)
+            db.commit()
+
+            ###############################
+            # TABLE term_taxonomy
+            ###############################
+            sql = """
+            INSERT INTO wp_term_taxonomy(term_taxonomy_id, term_id, taxonomy, description, parent, count)
+            VALUES (NULL,0,'post_translations','a:2:{s:2:"bg";i:%i;s:2:"ru";i:%i;}',0,2)
+            """ % (tb_posts_id[0], row[0])
+            cur.execute(sql)
+            db.commit()
+            sql = "SELECT LAST_INSERT_ID();"
+            cur.execute(sql)
+            tb_term_taxonomy_id = cur.fetchone()
+            if tb_term_taxonomy_id is not None:
+                sql = """
+                UPDATE wp_term_taxonomy SET term_id='%i' WHERE term_taxonomy_id='%i'
+                """ % (tb_term_taxonomy_id[0], tb_term_taxonomy_id[0])
+                cur.execute(sql)
+                db.commit()
+                sql = """
+                INSERT INTO wp_terms(term_id, name, slug, term_group) 
+                VALUES ('%i','pll_6111111111111','pll_6111111111111',0)
+                """ % (tb_term_taxonomy_id[0])
+                cur.execute(sql)
+                db.commit()
+                ################################
+                # TABLE term_relationships
+                ################################
+                category_id_bg = 107
+                sql = """
+                SELECT wp_term_relationships.*, wp_term_taxonomy.taxonomy FROM wp_term_relationships LEFT JOIN wp_term_taxonomy
+                ON wp_term_relationships.term_taxonomy_id=wp_term_taxonomy.term_taxonomy_id WHERE wp_term_relationships.object_id='%i'
+                """ % row[0]
+                cur.execute(sql)
+                for tb_term_relationships in cur.fetchall():
+                    if (tb_term_relationships[3] == 'category') or (tb_term_relationships[3] == 'post_tag'):
+                        pattern_ru = r's:2:"ru";i:%i;' % tb_term_relationships[1]
+                        sql = """
+                        SELECT * FROM wp_term_taxonomy WHERE taxonomy LIKE 'term_translations'
+                        """
+                        cur.execute(sql)
+                        for res in cur.fetchall():
+                            search_result = re.search(pattern_ru, res[3])
+                            if search_result:
+                                search_result = re.search(pattern_bg, res[3])
+                                if search_result:
+                                    id_bg = int(search_result.group(1))
+                                    sql = """
+                                    INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
+                                    VALUES ('%i','%s','%s')
+                                    """ % (tb_posts_id[0], id_bg, tb_term_relationships[2])
+                                    cur.execute(sql)
+                                    db.commit()
+
+                                    sql = """
+                                    UPDATE wp_term_taxonomy SET count=count+1 WHERE term_taxonomy_id='%s' 
+                                    """ % (id_bg)
+                                    cur.execute(sql)
+                                    db.commit()
+
+                                    if (tb_term_relationships[3] == 'category'):
+                                        category_id_bg = id_bg
+                                    break
+                    elif (tb_term_relationships[3] == 'language'):
+                        language_id_bg = 105
+                        sql = """
+                        INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
+                        VALUES ('%i','%i','%s')
+                        """ % (tb_posts_id[0], language_id_bg, tb_term_relationships[2])
+                        cur.execute(sql)
+                        db.commit()
+
+                        sql = """
+                        UPDATE wp_term_taxonomy SET count=count+1 WHERE term_taxonomy_id='%i' 
+                        """ % (language_id_bg)
+                        cur.execute(sql)
+                        db.commit()
+                sql = """
+                INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
+                VALUES ('%i','%i',0)
+                """ % (tb_posts_id[0], tb_term_taxonomy_id[0])
+                cur.execute(sql)
+                db.commit()
+                sql = """
+                INSERT INTO wp_term_relationships(object_id, term_taxonomy_id, term_order)
+                VALUES ('%i','%i',0)
+                """ % (row[0], tb_term_taxonomy_id[0])
+                cur.execute(sql)
+                db.commit()
+
+                ########################
+                # TABLE postmeta
+                ########################
+                sql = """
+                SELECT * FROM wp_postmeta WHERE post_id = '%i'
+                """ % row[0]
+                cur.execute(sql)
+                for tb_postmeta in cur.fetchall():
+                    meta_value = ""
+                    if (tb_postmeta[2] == '_yoast_wpseo_focuskw') or (tb_postmeta[2] == '_yoast_wpseo_title'):
+                        response = generate_response(tb_postmeta[3])
+                        if response is not None:
+                            meta_value = response
+                            time.sleep(25)
+                            meta_value = MySQLdb.escape_string(meta_value).decode('utf-8')
+                        else:
+                            time.sleep(25)
+                            continue
+                    elif (tb_postmeta[2] == '_original_post'):
+                        meta_value = "https://adminkin.com/?p=%i" % tb_posts_id[0]
+                    elif (tb_postmeta[2] == '_yoast_wpseo_primary_category'):
+                        meta_value = "%i" % category_id_bg
+                    else:
+                        meta_value = MySQLdb.escape_string(tb_postmeta[3]).decode('utf-8')
+
+                    insert_into_table_postmeta(tb_posts_id[0], tb_postmeta[2], meta_value)
+                ##############################
+                # TABLE posts_translated
+                ##############################
+                insert_into_table_postsTranslated(row[0], row[14])
+    db.close()