# Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

# -*- mode: Ruby; coding: utf-8; -*-

require "test/unit"

require "rast_test"

module Rast
  class Encoding
    class UTF8Test < Test::Unit::TestCase
      # Test fixture: looks up the encoding registered under the name "utf8"
      # and stores it in @encoding, which every test method below uses.
      # Because this class is nested inside Rast::Encoding, the constant
      # +Encoding+ here resolves to Rast::Encoding.
      def setup
        @encoding = Encoding["utf8"]
      end

      # Feeds +text+ to @encoding.register_tokenize and returns every yielded
      # [ngram, pos, complete] triple, in yield order, as an array of arrays.
      def collect_register_tokens(text)
        tokens = []
        @encoding.register_tokenize(text) do |ngram, pos, complete|
          tokens.push([ngram, pos, complete])
        end
        tokens
      end
      private :collect_register_tokens

      # Checks the complete token stream that register_tokenize yields for a
      # range of inputs.  Each case pairs an input text with the exact list of
      # [ngram, pos, complete] triples expected for it.
      #
      # In these expectations one token is yielded per character position
      # (pos counts characters, not bytes), and the trailing tokens that run
      # into the end of the text carry complete == false.
      #
      # The whole list is compared at once so that a missing or extra token
      # is reported as an assertion failure naming the input text.
      def test_register_tokenize
        expectations = [
          # ASCII words, spaces and a trailing period.
          ["ruby is great.",
           [["rub", 0, true], ["uby", 1, true], ["by ", 2, true],
            ["y i", 3, true], [" is", 4, true], ["is ", 5, true],
            ["s g", 6, true], [" gr", 7, true], ["gre", 8, true],
            ["rea", 9, true], ["eat", 10, true], ["at", 11, true],
            ["t.", 12, true], [".", 13, false]]],
          # Katakana.
          ["アイウエオ",
           [["アイウ", 0, true], ["イウエ", 1, true], ["ウエオ", 2, true],
            ["エオ", 3, false], ["オ", 4, false]]],
          # Hiragana.
          ["あいうえお",
           [["あいう", 0, true], ["いうえ", 1, true], ["うえお", 2, true],
            ["えお", 3, false], ["お", 4, false]]],
          # Kanji: complete tokens are two characters long here.
          ["日本語",
           [["日本", 0, true], ["本語", 1, true], ["語", 2, false]]],
          # ASCII followed by hiragana.
          ["Rubyですよ",
           [["Rub", 0, true], ["uby", 1, true], ["by", 2, true],
            ["yで", 3, true], ["ですよ", 4, true], ["すよ", 5, false],
            ["よ", 6, false]]],
          # Empty and single-character inputs.
          ["", []],
          ["a", [["a", 0, false]]],
          ["あ", [["あ", 0, false]]],
          # Runs of spaces are tokenized as-is.
          ["DE    F g",
           [["DE ", 0, true], ["E  ", 1, true], ["   ", 2, true],
            ["   ", 3, true], ["  F", 4, true], [" F ", 5, true],
            ["F g", 6, true], [" g", 7, false], ["g", 8, false]]],
          # Repeated identical n-grams are each reported.
          ["100000",
           [["100", 0, true], ["000", 1, true], ["000", 2, true],
            ["000", 3, true], ["00", 4, false], ["0", 5, false]]],
          # An embedded NUL byte does not terminate the text.
          ["abc\0def",
           [["abc", 0, true], ["bc", 1, true], ["c\0", 2, true],
            ["\0d", 3, true], ["def", 4, true], ["ef", 5, false],
            ["f", 6, false]]],
          # Truncated UTF-8 sequences (a lone lead byte \xE3).
          ["\xE3", [["\xE3", 0, false]]],
          ["\xE3\x81\x82\xE3",
           [["\xE3\x81\x82\xE3", 0, true], ["\xE3", 1, false]]],
          ["\xE3\x81\x82\xE3\x81\x82\xE3",
           [["\xE3\x81\x82\xE3\x81\x82", 0, true],
            ["\xE3\x81\x82\xE3", 1, true], ["\xE3", 2, false]]]
        ]

        expectations.each do |text, expected|
          assert_equal(expected, collect_register_tokens(text),
                       "register_tokenize(#{text.inspect})")
        end
      end

      # Feeds +text+ to @encoding.search_tokenize and returns every yielded
      # [ngram, pos, complete] triple, in yield order, as an array of arrays.
      def collect_search_tokens(text)
        tokens = []
        @encoding.search_tokenize(text) do |ngram, pos, complete|
          tokens.push([ngram, pos, complete])
        end
        tokens
      end
      private :collect_search_tokens

      # Checks the complete token stream that search_tokenize yields for a
      # range of inputs.  Each case pairs an input text with the exact list of
      # [ngram, pos, complete] triples expected for it.
      #
      # In these expectations the stream stops after the last token flagged
      # complete; an incomplete token is only expected when the input is a
      # single character ("a", "あ").
      #
      # The whole list is compared at once so that a missing or extra token
      # is reported as an assertion failure naming the input text.
      def test_search_tokenize
        expectations = [
          # ASCII words, spaces and a trailing period.
          ["ruby is great.",
           [["rub", 0, true], ["uby", 1, true], ["by ", 2, true],
            ["y i", 3, true], [" is", 4, true], ["is ", 5, true],
            ["s g", 6, true], [" gr", 7, true], ["gre", 8, true],
            ["rea", 9, true], ["eat", 10, true], ["at", 11, true],
            ["t.", 12, true]]],
          # Katakana.
          ["アイウエオ",
           [["アイウ", 0, true], ["イウエ", 1, true], ["ウエオ", 2, true]]],
          # Hiragana.
          ["あいうえお",
           [["あいう", 0, true], ["いうえ", 1, true], ["うえお", 2, true]]],
          # Kanji: complete tokens are two characters long here.
          ["日本語",
           [["日本", 0, true], ["本語", 1, true]]],
          # ASCII followed by hiragana.
          ["Rubyですよ",
           [["Rub", 0, true], ["uby", 1, true], ["by", 2, true],
            ["yで", 3, true], ["ですよ", 4, true]]],
          # Empty and single-character inputs.
          ["", []],
          ["a", [["a", 0, false]]],
          ["あ", [["あ", 0, false]]],
          # Runs of spaces are tokenized as-is.
          ["DE    F g",
           [["DE ", 0, true], ["E  ", 1, true], ["   ", 2, true],
            ["   ", 3, true], ["  F", 4, true], [" F ", 5, true],
            ["F g", 6, true]]],
          # Repeated identical n-grams are each reported.
          ["100000",
           [["100", 0, true], ["000", 1, true], ["000", 2, true],
            ["000", 3, true]]],
          # Katakana containing the prolonged sound mark.
          ["コート", [["コート", 0, true]]]
        ]

        expectations.each do |text, expected|
          assert_equal(expected, collect_search_tokens(text),
                       "search_tokenize(#{text.inspect})")
        end
      end

      # Verifies normalize_text against three groups of [input, expected]
      # pairs, checked in order: whitespace runs, half-width katakana and
      # full-width characters.
      def test_normalize_text
        # Runs of spaces, tabs, CRs and LFs become one space; leading and
        # trailing runs are collapsed to a single space, not stripped.
        whitespace_cases = [
          ["  abc  ", " abc "],
          [" abc\nabc", " abc abc"],
          ["a\n \t b\nc\r\rd \ne ", "a b c d e "]
        ]

        # Half-width katakana become full-width; a following voiced or
        # semi-voiced sound mark is merged into the preceding kana, also
        # when that kana is already full-width.
        kana_cases = [
          ["ｱｲｳ", "アイウ"],
          ["アイウｱｲｳ", "アイウアイウ"],
          ["ﾀﾁﾂﾃﾄ", "タチツテト"],
          ["ｻｼｽｾｿﾀﾁﾂﾃﾄ", "サシスセソタチツテト"],
          ["ｶﾞｷﾞｸﾞｹﾞｺﾞ", "ガギグゲゴ"],
          ["ﾀﾞﾁﾞﾂﾞﾃﾞﾄﾞ", "ダヂヅデド"],
          ["ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ", "パピプペポ"],
          ["ｳﾞ", "ヴ"],
          ["ハﾞウﾞハﾟ", "バヴパ"]
        ]

        # Full-width ASCII and the ideographic space map to their ASCII
        # counterparts; the ideographic space also takes part in whitespace
        # collapsing.
        fullwidth_cases = [
          ["　ＡＢＣ＋＆＜＞", " ABC+&<>"],
          ["　 　\n\r  \t 　", " "]
        ]

        all_cases = whitespace_cases + kana_cases + fullwidth_cases
        all_cases.each do |raw, normalized|
          assert_equal(normalized, @encoding.normalize_text(raw))
        end
      end

      # Verifies that normalize_chars lower-cases letters and leaves other
      # characters unchanged.
      def test_normalize_chars
        # ASCII letters.
        assert_equal("abc", @encoding.normalize_chars("ABC"))

        # Two-byte letter: U+00C1 (\xC3\x81) maps to U+00E1 (\xC3\xA1).
        assert_equal("\xC3\xA1", @encoding.normalize_chars("\xC3\x81"))

        # The copyright sign (\xC2\xA9) passes through untouched.
        assert_equal("ruby \xC2\xA9", @encoding.normalize_chars("Ruby \xC2\xA9"))

        # TODO: input containing an invalid UTF-8 byte (\xA9); currently
        # disabled:
        #
        #   s = @encoding.normalize_chars("Ruby \xA9")
        #   assert_equal("ruby \xA9", s)
      end
    end
  end
end
