/*
    Copyright (C) HWPORT.COM
    All rights reserved.
    Author: JAEHYUK CHO <mailto:minzkn@minzkn.com>
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Counts the consecutive set bits of s_value, starting at bit 7 and moving downward (result is 0..8). */
static int hwport_utf8_count_msb(unsigned int s_value);
/* Prints the UTF-8 units found in the first s_size bytes of s_utf8_string to stdout; returns the byte offset reached. */
static int hwport_utf8_print_characters(const char *s_utf8_string, size_t s_size);

/*
    Returns how many leading bytes of s_utf8_string (at most s_want_size and at most
    s_utf8_string_size) form complete UTF-8 sequences. When s_character_count_ptr is
    not a null pointer, the number of sequences counted is stored through it.
*/
static size_t hwport_utf8_cut_size(size_t s_want_size, const char *s_utf8_string, size_t s_utf8_string_size, size_t *s_character_count_ptr);

/* Program entry point: runs both routines on argv[1] when given, otherwise on a built-in sample. */
int main(int s_argc, char **s_argv);

/*
    Counts the run of set bits at the top of a byte.

    Only bits 7..0 of s_value are examined. Scanning starts at bit 7 and
    walks toward bit 0, stopping at the first cleared bit.

    Returns the length of that run, from 0 (bit 7 clear) to 8 (all eight
    bits set). For a UTF-8 byte this means: 0 = single-byte character,
    1 = continuation byte, 2 or more = lead byte of a multi-byte sequence.
*/
static int hwport_utf8_count_msb(unsigned int s_value)
{
    unsigned int s_mask = 0x80u;
    int s_run = 0;

    /* The mask becomes zero once it has been shifted past bit 0. */
    while((s_mask != 0u) && ((s_value & s_mask) != 0u)) {
        ++s_run;
        s_mask >>= 1;
    }

    return(s_run);
}

/*
    Prints one unit (a run of raw bytes) to stdout as a single line:

        [HH HH ...(<text>)]

    - An 'X' is printed first when the leading byte looks like a UTF-8
      continuation byte (exactly one leading set bit).
    - Every byte is then dumped as two upper-case hex digits; the first one
      is preceded by '[' and the following ones by a space.
    - Finally the same bytes are written as text between ANSI colour
      escapes, with each NUL byte shown as a space.

    s_ptr   bytes to print (read only, never modified)
    s_size  number of bytes at s_ptr; 0 prints an empty unit

    The bytes are read straight from the caller's buffer. No temporary copy
    is allocated, so there is no allocation failure to handle and no
    size + 1 overflow to worry about.
*/
static void print_unit(const void *s_ptr, size_t s_size)
{
    const unsigned char *s_bytes = (const unsigned char *)s_ptr;
    size_t s_offset;

    /* An empty unit has no leading byte to classify. */
    if((s_size > (size_t)0u) && (hwport_utf8_count_msb((unsigned int)s_bytes[0]) == 1)) {
        (void)fprintf(stdout, "X");
    }

    for(s_offset = (size_t)0u;s_offset < s_size;s_offset++) {
        (void)fprintf(
            stdout,
            "%c%02X",
            (s_offset == ((size_t)0u)) ? '[' : ' ',
            (unsigned int)s_bytes[s_offset]
        );
    }

    (void)fputs("(\x1b[1;33m", stdout);
    for(s_offset = (size_t)0u;s_offset < s_size;s_offset++) {
        /* A NUL byte would be invisible in the text view, so show a space instead. */
        (void)fputc((s_bytes[s_offset] == 0u) ? ' ' : (int)s_bytes[s_offset], stdout);
    }
    (void)fputs("\x1b[0m)]\n", stdout);
}

/*
    Walks a byte buffer as UTF-8 and prints every encoded character, one per
    line, through print_unit().

    s_utf8_string  buffer to scan (may contain NUL bytes)
    s_size         number of bytes in the buffer

    Scanning stops at the end of the buffer or at the first invalid
    sequence, for which a diagnostic line is written to stdout. A sequence
    is treated as invalid when:
      - a continuation byte appears where a lead byte is expected,
      - the lead byte announces more than 4 bytes (not allowed by RFC3629;
        this matches the check done by hwport_utf8_cut_size()),
      - the buffer ends before the announced length is reached,
      - one of the trailing bytes is not a continuation byte.

    Returns the byte offset at which scanning stopped, converted to int.
*/
static int hwport_utf8_print_characters(const char *s_utf8_string, size_t s_size)
{
    size_t s_offset;
    int s_count_msb;

    for(s_offset = (size_t)0u;s_offset < s_size;) {
        s_count_msb = hwport_utf8_count_msb((unsigned int)s_utf8_string[s_offset]);

        if(s_count_msb == 0) { /* U+0000 ~ U+007F */
            print_unit((const void *)(&s_utf8_string[s_offset]), (size_t)1u);
            ++s_offset;
        }
        else if((s_count_msb == 1) || (s_count_msb >= 5)) {
            /* stray continuation byte, or a lead byte longer than RFC3629 permits */
            (void)fprintf(stdout, "invalid utf8 sequence ! (offset=%lu)\n", (unsigned long)s_offset);
            break;
        }
        else { /* U+0080 ~ : lead byte followed by (s_count_msb - 1) continuation bytes */
            size_t s_unit_offset;
            int s_valid = 0;

            for(s_unit_offset = (size_t)1u;s_unit_offset < ((size_t)s_count_msb);s_unit_offset++) {
                if((s_offset + s_unit_offset) >= s_size) { /* truncated sequence */
                    s_valid = 0;
                    break;
                }
                if(hwport_utf8_count_msb((unsigned int)(s_utf8_string[s_offset + s_unit_offset])) != 1) {
                    s_valid = 0;
                    break;
                }

                s_valid = 1;
            }

            if(s_valid == 0) {
                (void)fprintf(stdout, "invalid utf8 sequence ! (offset=%lu, unit_offset=%lu)\n", (unsigned long)s_offset, (unsigned long)s_unit_offset);
                break;
            }

            print_unit((const void *)(&s_utf8_string[s_offset]), (size_t)s_count_msb);
            s_offset += (size_t)s_count_msb;
        }
    }

    return((int)s_offset);
}

/*
    Finds the longest prefix of a UTF-8 buffer that fits in a byte budget
    without splitting a character.

    s_want_size            byte budget for the prefix
    s_utf8_string          buffer to scan (may contain NUL bytes)
    s_utf8_string_size     number of bytes in the buffer
    s_character_count_ptr  optional; when not null, receives the number of
                           characters contained in the returned prefix

    Only the first min(s_want_size, s_utf8_string_size) bytes are examined.
    Scanning stops early at the first sequence that is invalid or does not
    fit completely: a stray continuation byte, a lead byte announcing 5 or
    more bytes (RFC3629), a sequence cut off by the limit, or a trailing
    byte that is not a continuation byte.

    Returns the size in bytes of the accepted prefix.
*/
static size_t hwport_utf8_cut_size(size_t s_want_size, const char *s_utf8_string, size_t s_utf8_string_size, size_t *s_character_count_ptr)
{
    const size_t s_limit = (s_utf8_string_size > s_want_size) ? s_want_size : s_utf8_string_size;
    size_t s_position = (size_t)0u;
    size_t s_characters = (size_t)0u;

    while(s_position < s_limit) {
        const int s_length = hwport_utf8_count_msb((unsigned int)s_utf8_string[s_position]);
        size_t s_index;

        if(s_length == 0) { /* U+0000 ~ U+007F */
            ++s_position;
            ++s_characters;
            continue;
        }

        /* continuation byte in lead position, or longer than RFC3629 allows */
        if((s_length == 1) || (s_length >= 5)) { break; }

        /* the whole sequence must fit below the limit */
        if(((size_t)s_length) > (s_limit - s_position)) { break; }

        /* every byte after the lead byte must be a continuation byte */
        for(s_index = (size_t)1u;s_index < ((size_t)s_length);++s_index) {
            if(hwport_utf8_count_msb((unsigned int)s_utf8_string[s_position + s_index]) != 1) { break; }
        }
        if(s_index < ((size_t)s_length)) { break; }

        s_position += (size_t)s_length;
        ++s_characters;
    }

    if(s_character_count_ptr != NULL) {
        *s_character_count_ptr = s_characters;
    }

    return(s_position);
}

/*
    Demo driver.

    Input is argv[1] when present (its length taken with strlen), otherwise
    a built-in sample that contains an embedded NUL; for the sample the
    size is the whole array, including its terminating NUL.

    1. Dumps the input character by character.
    2. For every byte budget from 0 up to (but not including) the input
       size, reports the cut size and character count, then prints the
       resulting prefix.

    Always returns EXIT_SUCCESS.
*/
int main(int s_argc, char **s_argv)
{
    static const char cg_default_input[] = "Hello world.\0한글과 영문의 글자를 적절히 표현하기 위해서는? 어떻게";
    const char *s_input = &cg_default_input[0];
    size_t s_input_size = sizeof(cg_default_input);
    size_t s_budget;

    if(s_argc >= 2) {
        s_input = (const char *)s_argv[1];
        s_input_size = strlen(s_input);
    }

    (void)fputs("=== TEST hwport_utf8_print_characters ===\n", stdout);
    (void)hwport_utf8_print_characters(s_input, s_input_size);
    (void)fputs("\n", stdout);

    for(s_budget = (size_t)0u;s_budget < s_input_size;++s_budget) {
        size_t s_character_count;
        const size_t s_cut = hwport_utf8_cut_size(s_budget, s_input, s_input_size, &s_character_count);

        (void)fprintf(
            stdout,
            "=== WANT_SIZE=%lu (limtied_size=%lu, character_count=%lu) ===\n",
            (unsigned long)s_budget,
            (unsigned long)s_cut,
            (unsigned long)s_character_count
        );
        print_unit(s_input, s_cut);
        (void)fputs("\n", stdout);
    }

    return(EXIT_SUCCESS);
}

/* vim: set expandtab: */
/* End of source */
