// Generate the table for partially_normalise_unicode() in ../unicode.c
// Placed in the public domain.

// g++ -Wall --std=c++11 generate_unicode_compose_table.cpp ../unicode.c -o table

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <vector>

// Decoder from a UTF-8 byte string to wide characters (judging by its name and
// its use in main()). No header is included for it; the build command at the
// top of this file compiles ../unicode.c alongside this generator. main()
// reads the count written to *length and releases the returned buffer with
// free().
wchar_t *utf8_decode(unsigned char *input, ssize_t *length);

// Composition rules whose composed (destination) codepoint is at or above this
// value are left out of the generated table.
#define MAX_CODEPOINT 0x400

// One composition rule: the pair (src1, src2) composes to dest.
struct entry {
    int src1;
    int src2;
    int dest;

    // Strict ordering on (src1, src2); dest takes no part in the comparison.
    bool operator<(const entry &rhs) const {
        if (src1 != rhs.src1) {
            return src1 < rhs.src1;
        }
        return src2 < rhs.src2;
    }
};

// Composition rules collected by main(), which sorts them before printing.
std::vector<entry> rules;

int main() {
    // I don't think this table is totally complete, but it has a lot that we omit. It is derived from
    // https://github.com/r12a/r12a.github.io/blob/master/code/normalization/js/n11ndata-lite.js
    FILE *file = fopen("unicode_decompositions.txt","r");
    if (!file) perror("Opening unicode_decompositions.txt"), abort();
    char *line;
    size_t buflen = 0;
    for (int linenum = 0; ; linenum++) {
            if (getline(&line, &buflen, file) == -1)
                break;

            ssize_t numchars = 0;
            wchar_t* uline =  utf8_decode((unsigned char*)line, &numchars);
            if (numchars == 4) {
                // Not a decomposition, but an equivalence between normal and compatibility forms
                // We could include rules for these...
                //printf("%d -> %d\n", uline[2], uline[0]);
                free(uline);
                continue;
            }
            assert(numchars == 5);
            assert(uline[1] == ':');
            assert(uline[4] == '\n');

            if (uline[0] < MAX_CODEPOINT) {
                rules.push_back(entry());
                rules.back().src1 = uline[2];
                rules.back().src2 = uline[3];
                rules.back().dest = uline[0];
            }
            //printf("%d %d -> %d\n", uline[2], uline[3], uline[0]);
            free(uline);
    }

    // Sort to allow bisection search
    sort(rules.begin(), rules.end());

    printf("// Generated by misc/generate_unicode_compose_table.c\n");
    printf("// This table only contains composition rules that generate\n");
    printf("// characters below 0x%x\n", MAX_CODEPOINT);
    printf("static const short compose_table[] = {");

    int line_length = 999;
    for (auto rule=rules.begin(); rule!=rules.end(); ++rule) {
        if (line_length > 70) {
            line_length = printf("\n    ");
        }
        //printf("%c %d %d %d\n", rule->src1, rule->src1, rule->src2, rule->dest);
        line_length += printf("%d,%d,%d,", rule->src1, rule->src2, rule->dest);
    }
    printf("\n};\n#define N_COMPOSE_RULES %zd\n", rules.size());

    return 0;
}
