# ROOTPLOIT

# File: //opt/gsutil/third_party/charset_normalizer/tests/test_full_detection.py
from charset_normalizer.api import from_path
import pytest
from os import path, pardir

# Directory containing this test module, with symlinks resolved.
_THIS_DIR = path.dirname(path.realpath(__file__))

# Parent of the test directory, expressed as "<this dir>/<pardir>" (the
# parent reference is left un-normalised, exactly as it is joined here).
DIR_PATH = path.join(_THIS_DIR, pardir)


@pytest.mark.parametrize(
    "input_data_file, expected_charset, expected_language",
    [
        ('sample-arabic-1.txt', 'cp1256', 'Arabic'),
        ('sample-french-1.txt', 'cp1252', 'French'),
        ('sample-arabic.txt', 'utf_8', 'Arabic'),
        ('sample-russian-3.txt', 'utf_8', 'Russian'),
        ('sample-french.txt', 'utf_8', 'French'),
        ('sample-chinese.txt', 'big5', 'Chinese'),
        ('sample-greek.txt', 'cp1253', 'Greek'),
        ('sample-greek-2.txt', 'cp1253', 'Greek'),
        ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'),
        ('sample-hebrew-3.txt', 'cp1255', 'Hebrew'),
        ('sample-bulgarian.txt', 'utf_8', 'Bulgarian'),
        ('sample-english.bom.txt', 'utf_8', 'English'),
        ('sample-spanish.txt', 'utf_8', 'Spanish'),
        ('sample-korean.txt', 'cp949', 'Korean'),
        ('sample-turkish.txt', 'cp1254', 'Turkish'),
        ('sample-russian-2.txt', 'utf_8', 'Russian'),
        ('sample-russian.txt', 'mac_cyrillic', 'Russian'),
        ('sample-polish.txt', 'utf_8', 'Polish'),
    ]
)
def test_elementary_detection(
    input_data_file: str,
    expected_charset: str,
    expected_language: str,
) -> None:
    """Check the charset and language detected for one sample file.

    The sample is read from the ``data`` directory under ``DIR_PATH`` and
    handed to ``from_path``; the ``best()`` match must exist and its
    ``encoding`` and ``language`` must equal the expected values.

    Args:
        input_data_file: File name of the sample inside the ``data`` directory.
        expected_charset: Value the best match's ``encoding`` must equal.
        expected_language: Value the best match's ``language`` must equal.
    """
    # Join the components with ``path.join`` instead of concatenating with a
    # hard-coded "/" so the separator is always the platform's own.
    sample_path = path.join(DIR_PATH, "data", input_data_file)
    best_guess = from_path(sample_path).best()

    # ``best()`` may yield None, so guard before reading attributes to get a
    # readable failure rather than an AttributeError.
    assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file)
    assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file)
    assert best_guess.language == expected_language, "Elementary language detection has failed upon '{}'".format(input_data_file)