From b1d6e4e57b08f2d98f072c252863fa30f7784d8b Mon Sep 17 00:00:00 2001
From: Mufeed Ali
Date: Mon, 11 Oct 2021 21:40:21 +0530
Subject: [PATCH] Use CLDR for language names

---
 .gitignore                 |   1 +
 lang_update/README.md      |   8 ++
 lang_update/lang_update.py | 219 +++++++++++++++++++++++++++++++++++++
 3 files changed, 228 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 lang_update/README.md
 create mode 100644 lang_update/lang_update.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..927c388
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+cldr-json
diff --git a/lang_update/README.md b/lang_update/README.md
new file mode 100644
index 0000000..1bd6704
--- /dev/null
+++ b/lang_update/README.md
@@ -0,0 +1,8 @@
+# Language Names Updater
+
+This script tries to pull language names from the following sources:
+
+- Unicode CLDR
+- Google Translate
+
+If you would like to work on language names, please contribute to [Unicode CLDR](https://cldr.unicode.org/).
diff --git a/lang_update/lang_update.py b/lang_update/lang_update.py
new file mode 100644
index 0000000..141ee1d
--- /dev/null
+++ b/lang_update/lang_update.py
@@ -0,0 +1,219 @@
+import json
+import os
+import re
+import requests
+import subprocess
+from bs4 import BeautifulSoup
+
+
+LANGUAGES = {
+    "af": "Afrikaans",
+    "sq": "Albanian",
+    "am": "Amharic",
+    "ar": "Arabic",
+    "hy": "Armenian",
+    "az": "Azerbaijani",
+    "eu": "Basque",
+    "be": "Belarusian",
+    "bn": "Bengali",
+    "bs": "Bosnian",
+    "bg": "Bulgarian",
+    "ca": "Catalan",
+    "ceb": "Cebuano",
+    "ny": "Chichewa",
+    "zh": "Chinese",
+    "zh-CN": "Chinese (Simplified)",
+    "zh-TW": "Chinese (Traditional)",
+    "co": "Corsican",
+    "hr": "Croatian",
+    "cs": "Czech",
+    "da": "Danish",
+    "nl": "Dutch",
+    "en": "English",
+    "eo": "Esperanto",
+    "et": "Estonian",
+    "tl": "Filipino",
+    "fi": "Finnish",
+    "fr": "French",
+    "fy": "Frisian",
+    "gl": "Galician",
+    "ka": "Georgian",
+    "de": "German",
+    "el": "Greek",
+    "gu": "Gujarati",
+    "ht": "Haitian Creole",
+    "ha": "Hausa",
+    "haw": "Hawaiian",
+    "iw": "Hebrew",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hmn": "Hmong",
+    "hu": "Hungarian",
+    "is": "Icelandic",
+    "ig": "Igbo",
+    "id": "Indonesian",
+    "ga": "Irish",
+    "it": "Italian",
+    "ja": "Japanese",
+    "jw": "Javanese",
+    "kn": "Kannada",
+    "kk": "Kazakh",
+    "km": "Khmer",
+    "rw": "Kinyarwanda",
+    "ko": "Korean",
+    "ku": "Kurdish (Kurmanji)",
+    "ky": "Kyrgyz",
+    "lo": "Lao",
+    "la": "Latin",
+    "lv": "Latvian",
+    "lt": "Lithuanian",
+    "lb": "Luxembourgish",
+    "mk": "Macedonian",
+    "mg": "Malagasy",
+    "ms": "Malay",
+    "ml": "Malayalam",
+    "mt": "Maltese",
+    "mi": "Maori",
+    "mr": "Marathi",
+    "mn": "Mongolian",
+    "my": "Myanmar (Burmese)",
+    "ne": "Nepali",
+    "no": "Norwegian",
+    "or": "Odia (Oriya)",
+    "ps": "Pashto",
+    "fa": "Persian",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "pa": "Punjabi",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sm": "Samoan",
+    "gd": "Scots Gaelic",
+    "sr": "Serbian",
+    "st": "Sesotho",
+    "sn": "Shona",
+    "sd": "Sindhi",
+    "si": "Sinhala",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "so": "Somali",
+    "es": "Spanish",
+    "su": "Sundanese",
+    "sw": "Swahili",
+    "sv": "Swedish",
+    "tg": "Tajik",
+    "ta": "Tamil",
+    "tt": "Tatar",
+    "te": "Telugu",
+    "th": "Thai",
+    "tr": "Turkish",
+    "tk": "Turkmen",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "ug": "Uyghur",
+    "uz": "Uzbek",
+    "vi": "Vietnamese",
+    "cy": "Welsh",
+    "xh": "Xhosa",
+    "yi": "Yiddish",
+    "yo": "Yoruba",
+    "zu": "Zulu",
+}
+
+CLDR_NAMES = {
+    "zh_CN": "zh-Hans",
+    "zh_TW": "zh-Hant",
+    "zh-CN": "zh-Hans",
+    "zh-TW": "zh-Hant",
+}
+
+DIALECT_NAMES = {
+    "zh-Hans": "zh-CN",
+    "zh-Hant": "zh-TW",
+}
+
+
+# Add any language to this list to exclude it from the automated process.
+EXCLUDE_LIST = []
+
+
+if not os.path.isdir("cldr-json"):
+    print("Cloning Unicode CLDR repository...")
+    subprocess.call(["git", "clone", "https://github.com/unicode-org/cldr-json"])
+
+linguas_file = open("../LINGUAS", "r")
+for lang in linguas_file:
+    lang = lang.strip()
+    if lang and lang not in EXCLUDE_LIST:
+        cldr_present = True  # Assume CLDR file is present.
+        cldr_lang = CLDR_NAMES[lang] if lang in CLDR_NAMES else lang.replace("_", "-")
+
+        g_lang = lang.split("_")[0]
+
+        print(f"Reading {lang}.po ...")
+
+        lang_file = open(f"../{lang}.po", "r")
+        lang_file_contents = lang_file.read()
+        lang_file.close()
+
+        try:
+            print("Looking for required CLDR file...")
+            cldr_file = open(
+                f"cldr-json/cldr-json/cldr-localenames-full/main/{cldr_lang}/languages.json",
+                "r",
+            )
+            cldr_json = json.load(cldr_file)
+        except FileNotFoundError:
+            print(f"No CLDR file found for language: {cldr_lang}.")
+            try:
+                cldr_lang = g_lang
+                cldr_file = open(
+                    f"cldr-json/cldr-json/cldr-localenames-full/main/{cldr_lang}/languages.json",
+                    "r",
+                )
+                cldr_json = json.load(cldr_file)
+                print(f"Using file for {cldr_lang} instead.")
+            except FileNotFoundError:
+                print("Could not find possible substitutes.")
+                cldr_present = False  # Correct earlier assumption.
+
+        if cldr_present:
+            cldr_langs = cldr_json["main"][cldr_lang]["localeDisplayNames"]["languages"]
+            for lang_code, lang_name in cldr_langs.items():
+                if lang_code in DIALECT_NAMES:
+                    lang_code = DIALECT_NAMES[lang_code]
+
+                if lang_code not in LANGUAGES:
+                    continue
+
+                lang_file_contents = re.sub(
+                    rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr ".*"\n',
+                    rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr "{lang_name}"\n',
+                    lang_file_contents,
+                )
+        else:
+            print("Fetching localized names from Google Translate...")
+
+            page = requests.get("https://translate.google.com/?hl=" + g_lang)
+            soup = BeautifulSoup(page.text, "html5lib")
+
+            print("Generating updated string with localized names...")
+
+            for div in soup.find_all("div"):
+                if div.attrs.get("class", None) == ["qSb8Pe"]:
+                    lang_code = div.attrs["data-language-code"]
+                    lang_name = div.find(attrs={"class": "Llmcnf"}).string
+
+                    lang_file_contents = re.sub(
+                        rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr ".*"\n',
+                        rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr "{lang_name}"\n',
+                        lang_file_contents,
+                    )
+
+        print(f"Saving {lang}.po ...")
+
+        lang_file = open(f"../{lang}.po", "w")
+        lang_file.write(lang_file_contents)
+        lang_file.close()
+
+        print()