diff --git a/lang_update/README.md b/lang_update/README.md deleted file mode 100644 index 3a49ff7..0000000 --- a/lang_update/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Language Names Updater - -This script tries to pull language names from the sources: - -- Unicode CLDR -- Google Translate - -## How to use - -```bash -git clone https://github.com/dialect-app/po -cd po/lang_update -python lang_update.py -``` - -`lang_update.py` should be run with `po/lang_update` as the working directory and will not work as expected otherwise. - -`-g` or `--google` can be passed to force the usage of Google Translate as the source for language names. - -You can also pass a language code to only update one language: - -```bash -python lang_update.py "ca" -``` - -## How to contribute - -If you would like to work on language names, please contribute to [Unicode CLDR](https://cldr.unicode.org/). - -If you decide that the language names from Unicode CLDR are not good enough and feel like you could do a better job, you can open an issue at [dialect-app/po](https://github.com/dialect-app/po/issues) and continue updating your translation as per usual. You could also instead add your language code to the `EXCLUDE_LIST` in the `lang_update.py` script and send a PR. - -The `lang_update.py` script has a few things you could help with as well: - -- The `EXCLUDE_LIST` list could be expanded or shortened depending on the accuracy of Unicode CLDR project's language names for a particular language. You could do this by checking `cldr-json`. For example: [French Unicode CLDR languages.json](https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-localenames-full/main/fr/languages.json) . The link format is: - ``` - https://github.com/unicode-org/cldr-json/blob/main/cldr-json/cldr-localenames-full/main/{language_code_here}/languages.json - ``` -- If language names should be capitalized in your language, add the language code to `CAPS_LIST`. -- If your language is named differently in the Unicode CLDR project, add a mapping in `CLDR_NAMES`. diff --git a/lang_update/lang_update.py b/lang_update/lang_update.py deleted file mode 100644 index 22c3027..0000000 --- a/lang_update/lang_update.py +++ /dev/null @@ -1,252 +0,0 @@ -import argparse -import json -import os -import re -import requests -import subprocess -from bs4 import BeautifulSoup - - -LANGUAGES = { - "af": "Afrikaans", - "sq": "Albanian", - "am": "Amharic", - "ar": "Arabic", - "hy": "Armenian", - "az": "Azerbaijani", - "eu": "Basque", - "be": "Belarusian", - "bn": "Bengali", - "bs": "Bosnian", - "bg": "Bulgarian", - "ca": "Catalan", - "ceb": "Cebuano", - "ny": "Chichewa", - "zh": "Chinese", - "zh-Hans": "Chinese (Simplified)", - "zh-Hant": "Chinese (Traditional)", - "co": "Corsican", - "hr": "Croatian", - "cs": "Czech", - "da": "Danish", - "nl": "Dutch", - "en": "English", - "eo": "Esperanto", - "et": "Estonian", - "tl": "Filipino", - "fi": "Finnish", - "fr": "French", - "fy": "Frisian", - "gl": "Galician", - "ka": "Georgian", - "de": "German", - "el": "Greek", - "gu": "Gujarati", - "ht": "Haitian Creole", - "ha": "Hausa", - "haw": "Hawaiian", - "iw": "Hebrew", - "he": "Hebrew", - "hi": "Hindi", - "hmn": "Hmong", - "hu": "Hungarian", - "is": "Icelandic", - "ig": "Igbo", - "id": "Indonesian", - "ga": "Irish", - "it": "Italian", - "ja": "Japanese", - "jw": "Javanese", - "kn": "Kannada", - "kk": "Kazakh", - "km": "Khmer", - "rw": "Kinyarwanda", - "ko": "Korean", - "ku": "Kurdish (Kurmanji)", - "ky": "Kyrgyz", - "lo": "Lao", - "la": "Latin", - "lv": "Latvian", - "lt": "Lithuanian", - "lb": "Luxembourgish", - "mk": "Macedonian", - "mg": "Malagasy", - "ms": "Malay", - "ml": "Malayalam", - "mt": "Maltese", - "mi": "Maori", - "mr": "Marathi", - "mn": "Mongolian", - "my": "Myanmar (Burmese)", - "ne": "Nepali", - "no": "Norwegian", - "or": "Odia (Oriya)", - "ps": "Pashto", - "fa": "Persian", - "pl": "Polish", - "pt": "Portuguese", - "pa": "Punjabi", - "ro": "Romanian", - "ru": "Russian", - "sm": "Samoan", - "gd": "Scots Gaelic", - "sr": "Serbian", - "st": "Sesotho", - "sn": "Shona", - "sd": "Sindhi", - "si": "Sinhala", - "sk": "Slovak", - "sl": "Slovenian", - "so": "Somali", - "es": "Spanish", - "su": "Sundanese", - "sw": "Swahili", - "sv": "Swedish", - "tg": "Tajik", - "ta": "Tamil", - "tt": "Tatar", - "te": "Telugu", - "th": "Thai", - "tr": "Turkish", - "tk": "Turkmen", - "uk": "Ukrainian", - "ur": "Urdu", - "ug": "Uyghur", - "uz": "Uzbek", - "vi": "Vietnamese", - "cy": "Welsh", - "xh": "Xhosa", - "yi": "Yiddish", - "yo": "Yoruba", - "zu": "Zulu", -} - -CLDR_NAMES = { - "kmr": "ku", # They seem to be the same since Kurmanji (ku) is Northern Kurdish (kmr). - "zh_CN": "zh-Hans", - "zh_TW": "zh-Hant", - "zh-CN": "zh-Hans", - "zh-TW": "zh-Hant", -} - -# Add any language to this list to exclude it from the automated process. -EXCLUDE_LIST = [ - # No decent source - "oc", # Occitan - # Was manually updated by the translator - "eo", # Esperanto - "fr", # French - "fy", # Frisian - "ja", # Japanese - "lv", # Latvian - "uk", # Ukranian - "zh_CN", # Chinese -] -# Even if a language is in this list, you can pass it as a parameter to the program. - -# If any language is on this list and it shouldn't be, please create an issue: -# https://www.github.com/dialect-app/po -# You can also open an issue if any language should be added. - -# All languages that need "capitalization" -CAPS_LIST = [ - "it", # Italian -] - -parser = argparse.ArgumentParser() -parser.add_argument( - "language", nargs="?", help="the language code for language to update" -) -parser.add_argument( - "-g", "--google", help="force use google for language names", action="store_true" -) -args = parser.parse_args() - -if not os.path.isdir("cldr-json"): - print("Cloning Unicode CLDR repository...") - subprocess.call(["git", "clone", "https://github.com/unicode-org/cldr-json"]) - - -def process_language(lang, arged=False): - lang = lang.strip() - if lang and (arged is True or lang not in EXCLUDE_LIST): - cldr_present = True # Assume CLDR file is present. - cldr_lang = CLDR_NAMES[lang] if lang in CLDR_NAMES else lang.replace("_", "-") - - g_lang = lang.split("_")[0] - - print(f"Reading {lang}.po ...") - - lang_file = open(f"../{lang}.po", "r") - lang_file_contents = lang_file.read() - lang_file.close() - - try: - print("Looking for required CLDR file...") - cldr_file = open( - f"cldr-json/cldr-json/cldr-localenames-full/main/{cldr_lang}/languages.json", - "r", - ) - cldr_json = json.load(cldr_file) - except FileNotFoundError: - print(f"No CLDR file found for language: {cldr_lang}.") - try: - cldr_lang = g_lang - cldr_file = open( - f"cldr-json/cldr-json/cldr-localenames-full/main/{cldr_lang}/languages.json", - "r", - ) - cldr_json = json.load(cldr_file) - print(f"Using file for {cldr_lang} instead.") - except FileNotFoundError: - print("Could not find possible substitutes.") - cldr_present = False # Correct earlier assumption. - - if cldr_present and not args.google: - cldr_langs = cldr_json["main"][cldr_lang]["localeDisplayNames"]["languages"] - for lang_code, lang_name in cldr_langs.items(): - if lang_code not in LANGUAGES: - continue - - if cldr_lang in CAPS_LIST: - lang_name = lang_name.capitalize() - - lang_file_contents = re.sub( - rf'msgid "{re.escape(LANGUAGES[lang_code])}"\nmsgstr ".*"\n', - rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr "{lang_name}"\n', - lang_file_contents, - ) - else: - print("Fetching localized names from Google Translate...") - - page = requests.get("https://translate.google.com/?hl=" + g_lang) - soup = BeautifulSoup(page.text, "html5lib") - - print("Generating updated string with localized names...") - - for div in soup.find_all("div"): - if div.attrs.get("class", None) == ["qSb8Pe"]: - lang_code = div.attrs["data-language-code"] - lang_name = div.find(attrs={"class": "Llmcnf"}).string - - lang_file_contents = re.sub( - rf'msgid "{re.escape(LANGUAGES[lang_code])}"\nmsgstr ".*"\n', - rf'msgid "{LANGUAGES[lang_code]}"\nmsgstr "{lang_name}"\n', - lang_file_contents, - ) - - print(f"Saving {lang}.po ...") - - lang_file = open(f"../{lang}.po", "w") - lang_file.write(lang_file_contents) - lang_file.close() - - print() - - -if args.language: - process_language(args.language, True) -else: - linguas_file = open("../LINGUAS", "r") - for lang in linguas_file: - process_language(lang)