判断文件编码是否为UTF-8

2008-09-15 15:12 | 分类:C/C++, 原创 | 标签:, , | 作者:Aaron | 304 views

如何用程序来判断文件的编码呢?以下提供一个判断编码是否为UTF-8的方法:只需要把从文件中读出的、以'\0'结尾的字符串传入函数即可。注意:如果字符串全部是7位ASCII字符(不含任何多字节序列),函数同样返回false。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/*
 * Return true when c (a 7-bit value) is a byte expected in plain text:
 * any printable ASCII character, or one of the control characters
 * BEL, BS, HT, LF, FF, CR, ESC.  DEL (0x7F), VT (0x0B) and every other
 * control character are treated as "never appears in text".
 */
static bool is_plain_text_ascii(unsigned char c)
{
    if (c >= 0x20)
        return c != 0x7F;

    switch (c) {
    case 0x07: /* BEL */
    case 0x08: /* BS  */
    case 0x09: /* HT  */
    case 0x0A: /* LF  */
    case 0x0C: /* FF  */
    case 0x0D: /* CR  */
    case 0x1B: /* ESC */
        return true;
    default:
        return false;
    }
}

/*
 * Decide whether the NUL-terminated string buf looks like UTF-8 text.
 *
 * Returns true only if
 *   - every 7-bit byte is a plain-text character (see
 *     is_plain_text_ascii), and
 *   - every multi-byte sequence is well-formed UTF-8 as defined by
 *     RFC 3629: 2 to 4 bytes, no overlong encodings, no UTF-16
 *     surrogates (U+D800..U+DFFF), nothing above U+10FFFF, and
 *   - at least one complete multi-byte sequence was seen.
 *
 * Returns false for NULL, for the empty string, and for input that is
 * entirely 7-bit ASCII: such input is valid UTF-8 but carries no
 * evidence that distinguishes it from plain ASCII.
 *
 * A multi-byte sequence cut short by the terminating NUL is tolerated
 * (the caller may have handed over a truncated chunk); the result then
 * depends only on the complete sequences seen before it.
 */
bool isUtf8(const char *buf)
{
    const unsigned char *p = (const unsigned char *)buf;
    bool gotone = false;

    if (!buf)
        return false;

    while (*p != 0) {
        unsigned char lead = *p++;
        /* Allowed range of the FIRST continuation byte; narrowed below
         * for the lead bytes that could otherwise encode an overlong
         * form, a surrogate, or a value above U+10FFFF. */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        int following;
        int n;

        if (lead < 0x80) {              /* 0xxxxxxx is plain ASCII */
            /*
             * Even if the whole input is valid UTF-8 sequences,
             * still reject it if it uses weird control characters.
             */
            if (!is_plain_text_ascii(lead))
                return false;
            continue;
        }

        if (lead < 0xC2) {
            /* 10xxxxxx is never a first byte; 0xC0 and 0xC1 could only
             * start overlong encodings of ASCII. */
            return false;
        } else if (lead < 0xE0) {       /* 110xxxxx */
            following = 1;
        } else if (lead < 0xF0) {       /* 1110xxxx */
            following = 2;
            if (lead == 0xE0)
                lo = 0xA0;              /* below U+0800 would be overlong */
            else if (lead == 0xED)
                hi = 0x9F;              /* U+D800..U+DFFF are surrogates */
        } else if (lead < 0xF5) {       /* 11110xxx, up to U+10FFFF */
            following = 3;
            if (lead == 0xF0)
                lo = 0x90;              /* below U+10000 would be overlong */
            else if (lead == 0xF4)
                hi = 0x8F;              /* above U+10FFFF is not Unicode */
        } else {
            /* 0xF5..0xFF: beyond U+10FFFF or the obsolete 5/6-byte forms. */
            return false;
        }

        for (n = 0; n < following; n++) {
            unsigned char c = *p;

            if (c == 0)
                return gotone;          /* sequence truncated by end of string */
            if (c < lo || c > hi)
                return false;

            /* Only the first continuation byte has a restricted range. */
            lo = 0x80;
            hi = 0xBF;
            p++;
        }
        gotone = true;
    }

    return gotone;  /* don't claim it's UTF-8 if it's all 7-bit */
}

以下为样例用法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include <stdio.h>
#include <stdlib.h>
 
/*
 * Sample driver: scan "1.txt" chunk by chunk and report whether any
 * chunk is recognised by isUtf8().
 *
 * Prints "The file is utf-8!" as soon as one chunk is accepted, and
 * "The file is not utf-8." if none is (this includes files that are
 * entirely 7-bit ASCII, for which isUtf8() returns false).
 *
 * Limitation: isUtf8() returns false both for "all ASCII" and for
 * "invalid bytes", so an invalid chunk cannot make this loop reject
 * the file; a later valid chunk still yields "utf-8".
 *
 * Returns 0 after printing a verdict, EXIT_FAILURE if the file cannot
 * be opened.
 */
int main(void)
{
    char string[81];
    int found = 0;
    FILE *fp = fopen("1.txt", "r");

    if (fp == NULL) {
        /* An unreadable file is an error, not a "not utf-8" verdict. */
        perror("1.txt");
        return EXIT_FAILURE;
    }

    /*
     * Let fgets() drive the loop: testing feof() before reading would
     * re-check a stale buffer after the last chunk and would read an
     * uninitialised buffer when the file is empty.
     */
    while (fgets(string, sizeof string, fp) != NULL) {
        if (isUtf8(string)) {
            found = 1;
            break;
        }
    }

    if (ferror(fp))
        perror("1.txt");

    fclose(fp);

    if (found)
        printf("The file is utf-8!\n");
    else
        printf("The file is not utf-8.\n");

    return 0;
}

发表您的评论

您的名字:

您的邮箱: (*不会被公布)

您的网站: