/* $OpenBSD: wsemul_subr.c,v 1.2 2023/03/06 17:14:44 miod Exp $ */ /* * Copyright (c) 2007, 2013 Miodrag Vallat. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice, this permission notice, and the disclaimer below * appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Part of the UTF-8 state machine logic borrowed from citrus_utf8.c * under the following licence: */ /*- * Copyright (c) 2002-2004 Tim J. Robbins * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include int wsemul_local_translate(u_int32_t, kbd_t, u_char *); /* * Get characters from an input stream and update the input state. * Processing stops when the stream is empty, or a complete character * sequence has been recognized, in which case it returns zero. */ int wsemul_getchar(const u_char **inbuf, u_int *inlen, struct wsemul_inputstate *state, int allow_utf8) { u_int len = *inlen; const u_char *buf = *inbuf; #ifdef HAVE_UTF8_SUPPORT int rc; u_int32_t tmpchar, lbound; u_int mbleft; #endif if (len == 0) return EAGAIN; #ifndef HAVE_UTF8_SUPPORT state->inchar = *buf++; state->mbleft = 0; len--; *inlen = len; *inbuf = buf; return 0; #else /* * If we do not allow multibyte sequences, process as quickly * as possible. */ if (!allow_utf8) { state->inchar = *buf++; state->mbleft = 0; len--; *inlen = len; *inbuf = buf; return 0; } rc = EAGAIN; tmpchar = state->inchar; lbound = state->lbound; mbleft = state->mbleft; while (len != 0) { u_int32_t frag = (u_int32_t)*buf++; len--; /* * If we are in the middle of a multibyte sequence, try * to complete it. 
*/ if (mbleft != 0) { if ((frag & 0xc0) != 0x80) goto invalid; tmpchar = (tmpchar << 6) | (frag & 0x3f); mbleft--; if (mbleft == 0) { if (tmpchar < lbound) goto invalid; if (tmpchar >= 0xd800 && tmpchar < 0xe000) goto invalid; if (tmpchar >= 0x110000) goto invalid; rc = 0; break; } continue; } /* * Otherwise let's decide if this is the start of a new * multibyte sequence, or a 7-bit character. */ if ((frag & 0x80) == 0) { tmpchar = frag; rc = 0; break; } if ((frag & 0xe0) == 0xc0) { frag &= 0x1f; mbleft = 1; lbound = 0x80; } else if ((frag & 0xf0) == 0xe0) { frag &= 0x0f; mbleft = 2; lbound = 0x800; } else if ((frag & 0xf8) == 0xf0) { frag &= 0x07; mbleft = 3; lbound = 0x10000; } else { goto invalid; } tmpchar = frag; state->lbound = lbound; continue; invalid: /* Abort the ill-formed sequence and continue */ mbleft = 0; tmpchar = 0; rc = EILSEQ; } state->inchar = tmpchar; state->mbleft = mbleft; *inlen = len; *inbuf = buf; return rc; #endif } /* * Unicode Cyrillic to KOI8 translation table (starts at U+0400), * from RFC 2319. 
*/
/*
 * Entry N holds the KOI8 byte for code point U+0400 + N; a zero entry
 * means there is no translation for that code point.
 */
const u_int8_t cyrillic_to_koi8[] = {
	0x00,	/* IE grave */		/* 0400 */
	0xb3,	/* IO */
	0x00,	/* DJE */
	0x00,	/* GJE */
	0xb4,	/* UKR IE */
	0x00,	/* DZE */
	0xb6,	/* BYE/UKR I */
	0xb7,	/* YI */
	0x00,	/* JE */
	0x00,	/* LJE */
	0x00,	/* NJE */
	0x00,	/* TSHE */
	0x00,	/* KJE */
	0x00,	/* I grave */
	0x00,	/* short U */
	0x00,	/* DZHE */
	0xe1,	/* A */			/* 0410 */
	0xe2,	/* BE */
	0xf7,	/* VE */
	0xe7,	/* GHE */
	0xe4,	/* DE */
	0xe5,	/* IE */
	0xf6,	/* ZHE */
	0xfa,	/* ZE */
	0xe9,	/* I */
	0xea,	/* short I */
	0xeb,	/* KA */
	0xec,	/* EL */
	0xed,	/* EM */
	0xee,	/* EN */
	0xef,	/* O */
	0xf0,	/* PE */
	0xf2,	/* ER */		/* 0420 */
	0xf3,	/* ES */
	0xf4,	/* TE */
	0xf5,	/* U */
	0xe6,	/* EF */
	0xe8,	/* HA */
	0xe3,	/* TSE */
	0xfe,	/* CHE */
	0xfb,	/* SHA */
	0xfd,	/* SHCHA */
	0xff,	/* HARD SIGN */
	0xf9,	/* YERU */
	0xf8,	/* SOFT SIGN */
	0xfc,	/* E */
	0xe0,	/* YU */
	0xf1,	/* YA */
	0xc1,	/* a */			/* 0430 */
	0xc2,	/* be */
	0xd7,	/* ve */
	0xc7,	/* ghe */
	0xc4,	/* de */
	0xc5,	/* ie */
	0xd6,	/* zhe */
	0xda,	/* ze */
	0xc9,	/* i */
	0xca,	/* short i */
	0xcb,	/* ka */
	0xcc,	/* el */
	0xcd,	/* em */
	0xce,	/* en */
	0xcf,	/* o */
	0xd0,	/* pe */
	0xd2,	/* er */		/* 0440 */
	0xd3,	/* es */
	0xd4,	/* te */
	0xd5,	/* u */
	0xc6,	/* ef */
	0xc8,	/* ha */
	0xc3,	/* tse */
	0xde,	/* che */
	0xdb,	/* sha */
	0xdd,	/* shcha */
	0xdf,	/* hard sign */
	0xd9,	/* yeru */
	0xd8,	/* soft sign */
	0xdc,	/* e */
	0xc0,	/* yu */
	0xd1,	/* ya */
	0x00,	/* ie grave */		/* 0450 */
	0xa3,	/* io */
	0x00,	/* dje */
	0x00,	/* gje */
	0xa4,	/* ukr ie */
	0x00,	/* dze */
	0xa6,	/* bye/ukr i */
	0xa7,	/* yi */
	0x00,	/* je */
	0x00,	/* lje */
	0x00,	/* nje */
	0x00,	/* tshe */
	0x00,	/* kje */
	0x00,	/* i grave */
	0x00,	/* short u */
	0x00	/* dzhe */
};

/*
 * Europe to Latin-2 translation table (starts at U+0100).
*/
/*
 * Entry N holds the Latin-2 byte for code point U+0100 + N; a zero entry
 * means there is no translation for that code point.
 */
const u_int8_t unicode_to_latin2[] = {
	0x00,	/* A macron */		/* 0100 */
	0x00,	/* a macron */
	0xc3,	/* A breve */
	0xe3,	/* a breve */
	0xa1,	/* A ogonek */
	0xb1,	/* a ogonek */
	0xc6,	/* C acute */
	0xe6,	/* c acute */
	0x00,	/* C circumflex */
	0x00,	/* c circumflex */
	0x00,	/* C dot above */
	0x00,	/* c dot above */
	0xc8,	/* C caron */
	0xe8,	/* c caron */
	0xcf,	/* D caron */
	0xef,	/* d caron */
	0xd0,	/* D stroke */		/* 0110 */
	0xf0,	/* d stroke */
	0x00,	/* E macron */
	0x00,	/* e macron */
	0x00,	/* E breve */
	0x00,	/* e breve */
	0x00,	/* E dot above */
	0x00,	/* e dot above */
	0xca,	/* E ogonek */
	0xea,	/* e ogonek */
	0xcc,	/* E caron */
	0xec,	/* e caron */
	0x00,	/* G circumflex */
	0x00,	/* g circumflex */
	0x00,	/* G breve */
	0x00,	/* g breve */
	0x00,	/* G dot above */	/* 0120 */
	0x00,	/* g dot above */
	0x00,	/* G cedilla */
	0x00,	/* g cedilla */
	0x00,	/* H circumflex */
	0x00,	/* h circumflex */
	0x00,	/* H stroke */
	0x00,	/* h stroke */
	0x00,	/* I tilde */
	0x00,	/* i tilde */
	0x00,	/* I macron */
	0x00,	/* i macron */
	0x00,	/* I breve */
	0x00,	/* i breve */
	0x00,	/* I ogonek */
	0x00,	/* i ogonek */
	0x00,	/* dotted I */		/* 0130 */
	0x00,	/* non-dotted i */
	0x00,	/* ligature IJ */
	0x00,	/* ligature ij */
	0x00,	/* J circumflex */
	0x00,	/* j circumflex */
	0x00,	/* K cedilla */
	0x00,	/* k cedilla */
	0x00,	/* kra */
	0xc5,	/* L acute */
	0xe5,	/* l acute */
	0x00,	/* L cedilla */
	0x00,	/* l cedilla */
	0xa5,	/* L caron */
	0xb5,	/* l caron */
	0x00,	/* L middle dot */
	0x00,	/* l middle dot */	/* 0140 */
	0xa3,	/* L stroke */
	0xb3,	/* l stroke */
	0xd1,	/* N acute */
	0xf1,	/* n acute */
	0x00,	/* N cedilla */
	0x00,	/* n cedilla */
	0xd2,	/* N caron */
	0xf2,	/* n caron */
	0x00,	/* n preceded by apostrophe */
	0x00,	/* ENG */
	0x00,	/* eng */
	0x00,	/* O macron */
	0x00,	/* o macron */
	0x00,	/* O breve */
	0x00,	/* o breve */
	0xd5,	/* O double acute */	/* 0150 */
	0xf5,	/* o double acute */
	0x00,	/* ligature OE */
	0x00,	/* ligature oe */
	0xc0,	/* R acute */
	0xe0,	/* r acute */
	0x00,	/* R cedilla */
	0x00,	/* r cedilla */
	0xd8,	/* R caron */
	0xf8,	/* r caron */
	0xa6,	/* S acute */
	0xb6,	/* s acute */
	0x00,	/* S circumflex */
	0x00,	/* s circumflex */
	0xaa,	/* S cedilla */
	0xba,	/* s cedilla */
	0xa9,	/* S caron */		/* 0160 */
	0xb9,	/* s caron */
	0xde,	/* T cedilla */
	0xfe,	/* t cedilla */
	0xab,	/* T caron */
	0xbb,	/* t caron */
	0x00,	/* T stroke */
	0x00,	/* t stroke */
	0x00,	/* U tilde */
	0x00,	/* u tilde */
	0x00,	/* U macron */
	0x00,	/* u macron */
	0x00,	/* U breve */
	0x00,	/* u breve */
	0xd9,	/* U ring above */
	0xf9,	/* u ring above */
	0xdb,	/* U double acute */	/* 0170 */
	0xfb,	/* u double acute */
	0x00,	/* U ogonek */
	0x00,	/* u ogonek */
	0x00,	/* W circumflex */
	0x00,	/* w circumflex */
	0x00,	/* Y circumflex */
	0x00,	/* y circumflex */
	0x00,	/* Y diaeresis */
	0xac,	/* Z acute */
	0xbc,	/* z acute */
	0xaf,	/* Z dot above */
	0xbf,	/* z dot above */
	0xae,	/* Z caron */
	0xbe,	/* z caron */
	0x00	/* long s */
};

/*
 * Baltic to Latin-7 translation table (starts at U+0100).
 * Entry N holds the Latin-7 byte for code point U+0100 + N; a zero entry
 * means there is no translation for that code point.
 */
const u_int8_t unicode_to_latin7[] = {
	0xc2,	/* A macron */		/* 0100 */
	0xe2,	/* a macron */
	0x00,	/* A breve */
	0x00,	/* a breve */
	0xc0,	/* A ogonek */
	0xe0,	/* a ogonek */
	0xc3,	/* C acute */
	0xe3,	/* c acute */
	0x00,	/* C circumflex */
	0x00,	/* c circumflex */
	0x00,	/* C dot above */
	0x00,	/* c dot above */
	0xc8,	/* C caron */
	0xe8,	/* c caron */
	0x00,	/* D caron */
	0x00,	/* d caron */
	0x00,	/* D stroke */		/* 0110 */
	0x00,	/* d stroke */
	0xc7,	/* E macron */
	0xe7,	/* e macron */
	0x00,	/* E breve */
	0x00,	/* e breve */
	0xcb,	/* E dot above */
	0xeb,	/* e dot above */
	0xc6,	/* E ogonek */
	0xe6,	/* e ogonek */
	0x00,	/* E caron */
	0x00,	/* e caron */
	0x00,	/* G circumflex */
	0x00,	/* g circumflex */
	0x00,	/* G breve */
	0x00,	/* g breve */
	0x00,	/* G dot above */	/* 0120 */
	0x00,	/* g dot above */
	0xcc,	/* G cedilla */
	0xec,	/* g cedilla */
	0x00,	/* H circumflex */
	0x00,	/* h circumflex */
	0x00,	/* H stroke */
	0x00,	/* h stroke */
	0x00,	/* I tilde */
	0x00,	/* i tilde */
	0xce,	/* I macron */
	0xee,	/* i macron */
	0x00,	/* I breve */
	0x00,	/* i breve */
	0xc1,	/* I ogonek */
	0xe1,	/* i ogonek */
	0x00,	/* dotted I */		/* 0130 */
	0x00,	/* non-dotted i */
	0x00,	/* ligature IJ */
	0x00,	/* ligature ij */
	0x00,	/* J circumflex */
	0x00,	/* j circumflex */
	0xcd,	/* K cedilla */
	0xed,	/* k cedilla */
	0x00,	/* kra */
	0x00,	/* L acute */
	0x00,	/* l acute */
	0xcf,	/* L cedilla */
	0xef,	/* l cedilla */
	0x00,	/* L caron */
	0x00,	/* l caron */
	0x00,	/* L middle dot */
	0x00,	/* l middle dot */	/* 0140 */
	0xd9,	/* L stroke */
	0xf9,	/* l stroke */
	0xd1,	/* N acute */
	0xf1,	/* n acute */
	0xd2,	/* N cedilla */
	0xf2,	/* n cedilla */
	0x00,	/* N caron */
	0x00,	/* n caron */
	0x00,	/* n preceded by apostrophe */
	0x00,	/* ENG */
	0x00,	/* eng */
	0xd4,	/* O macron */
	0xf4,	/* o macron */
	0x00,	/* O breve */
	0x00,	/* o breve */
	0x00,	/* O double acute */	/* 0150 */
	0x00,	/* o double acute */
	0x00,	/* ligature OE */
	0x00,	/* ligature oe */
	0x00,	/* R acute */
	0x00,	/* r acute */
	0xaa,	/* R cedilla */
	0xba,	/* r cedilla */
	0x00,	/* R caron */
	0x00,	/* r caron */
	0xda,	/* S acute */
	0xfa,	/* s acute */
	0x00,	/* S circumflex */
	0x00,	/* s circumflex */
	0x00,	/* S cedilla */
	0x00,	/* s cedilla */
	0xd0,	/* S caron */		/* 0160 */
	0xf0,	/* s caron */
	0x00,	/* T cedilla */
	0x00,	/* t cedilla */
	0x00,	/* T caron */
	0x00,	/* t caron */
	0x00,	/* T stroke */
	0x00,	/* t stroke */
	0x00,	/* U tilde */
	0x00,	/* u tilde */
	0xdb,	/* U macron */
	0xfb,	/* u macron */
	0x00,	/* U breve */
	0x00,	/* u breve */
	0x00,	/* U ring above */
	0x00,	/* u ring above */
	0x00,	/* U double acute */	/* 0170 */
	0x00,	/* u double acute */
	0xd8,	/* U ogonek */
	0xf8,	/* u ogonek */
	0x00,	/* W circumflex */
	0x00,	/* w circumflex */
	0x00,	/* Y circumflex */
	0x00,	/* y circumflex */
	0x00,	/* Y diaeresis */
	0xca,	/* Z acute */
	0xea,	/* z acute */
	0xdd,	/* Z dot above */
	0xfd,	/* z dot above */
	0xde,	/* Z caron */
	0xfe,	/* z caron */
	0x00	/* long s */
};

/*
 * Keysym to local 8-bit charset sequence translation function.
* The out buffer is at least one character long.
 * The keyboard layout is used as a hint to decide which latin charset to
 * assume.
 *
 * The keysym is dispatched on its 128-code-point block (unisym >> 7) and,
 * where the layout encoding selects a charset, replaced by the matching
 * 8-bit code.  A keysym with no translation is passed through unchanged.
 * In every case the low 8 bits of the result are stored in out[0] and
 * the function returns 1, the number of bytes written.
 */
int
wsemul_local_translate(u_int32_t unisym, kbd_t layout, u_char *out)
{
	switch (unisym >> 7) {
	case 0x0080 >> 7:
		switch (KB_ENCODING(layout)) {
		case KB_LT:
		case KB_LV:
			switch (unisym) {
			case KS_L7_AE:
				unisym = 0xaf;
				break;
			case KS_L7_Ostroke:
				unisym = 0xa8;
				break;
			case KS_L7_ae:
				unisym = 0xbf;
				break;
			case KS_L7_ostroke:
				unisym = 0xb8;
				break;
			}
			break;
		}
		break;
	case 0x0100 >> 7:
		switch (KB_ENCODING(layout)) {
		case KB_LT:
		case KB_LV:
			/* A zero table entry means "no translation". */
			if (unisym < 0x100 + nitems(unicode_to_latin7) &&
			    unicode_to_latin7[unisym - 0x100] != 0)
				unisym = unicode_to_latin7[unisym - 0x100];
			break;
		case KB_TR:
			switch (unisym) {
			case KS_L5_Gbreve:
				unisym = 0xd0;
				break;
			case KS_L5_gbreve:
				unisym = 0xf0;
				break;
			case KS_L5_Idotabove:
				unisym = 0xdd;
				break;
			case KS_L5_idotless:
				unisym = 0xfd;
				break;
			case KS_L5_Scedilla:
				unisym = 0xde;
				break;
			case KS_L5_scedilla:
				unisym = 0xfe;
				break;
			}
			break;
		case KB_PL:
		case KB_SI:
			if (unisym < 0x100 + nitems(unicode_to_latin2) &&
			    unicode_to_latin2[unisym - 0x100] != 0)
				unisym = unicode_to_latin2[unisym - 0x100];
			break;
		}
		break;
	case 0x0280 >> 7:
		switch (KB_ENCODING(layout)) {
		case KB_PL:
		case KB_SI:
			switch (unisym) {
			case KS_L2_caron:
				unisym = 0xb7;
				break;
			case KS_L2_breve:
				unisym = 0xa2;
				break;
			case KS_L2_dotabove:
				unisym = 0xff;
				break;
			case KS_L2_ogonek:
				unisym = 0xb2;
				break;
			case KS_L2_dblacute:
				unisym = 0xbd;
				break;
			}
			break;
		}
		break;
	case 0x0400 >> 7:
		/*
		 * This case covers 0x400-0x47f but the table may be
		 * shorter, so the bounds check is required.  Same
		 * nitems() idiom as the Latin-2 and Latin-7 lookups.
		 */
		if (unisym < 0x400 + nitems(cyrillic_to_koi8) &&
		    cyrillic_to_koi8[unisym - 0x400] != 0)
			unisym = cyrillic_to_koi8[unisym - 0x400];
		break;
	case 0x0480 >> 7:
		if (unisym == KS_Cyrillic_GHEUKR)
			unisym = 0xbd;	/* ukrainian GHE */
		else if (unisym == KS_Cyrillic_gheukr)
			unisym = 0xad;	/* ukrainian ghe */
		break;
	case 0x2000 >> 7:
		switch (KB_ENCODING(layout)) {
		case KB_LT:
		case KB_LV:
			switch (unisym) {
			case KS_L7_rightsnglquot:
				unisym = 0xff;
				break;
			case KS_L7_leftdblquot:
				unisym = 0xb4;
				break;
			case KS_L7_rightdblquot:
				unisym = 0xa1;
				break;
			case KS_L7_dbllow9quot:
				unisym = 0xa5;
				break;
			}
			break;
		}
		break;
	}

	out[0] = unisym & 0xff;
	return (1);
}

/*
 * Keysym to UTF-8 sequence translation function.
 * The out buffer is at least 4 characters long.
 *
 * When HAVE_UTF8_SUPPORT is not defined, or allow_utf8 is zero, the work
 * is handed to wsemul_local_translate() and its result is returned.
 * Otherwise unisym is encoded as a 1 to 4 byte UTF-8 sequence in out and
 * the sequence length is returned.  Values in the surrogate range
 * (0xd800-0xdfff) and values of 0x110000 and above are not encoded:
 * nothing is written and 0 is returned.
 */
int
wsemul_utf8_translate(u_int32_t unisym, kbd_t layout, u_char *out,
    int allow_utf8)
{
#ifndef HAVE_UTF8_SUPPORT
	return (wsemul_local_translate(unisym, layout, out));
#else
	u_int pos, length, headpat;

	if (!allow_utf8)
		return wsemul_local_translate(unisym, layout, out);

	if (unisym < 0x80) {
		/* Fast path for plain ASCII characters. */
		*out = (u_char)unisym;
		return 1;
	}

	/* Pick the sequence length and the marker bits of its lead byte. */
	if (unisym < 0x800) {
		headpat = 0xc0;
		length = 2;
	} else if (unisym < 0x10000) {
		if (unisym >= 0xd800 && unisym < 0xe000)
			return 0;
		headpat = 0xe0;
		length = 3;
	} else {
		if (unisym >= 0x110000)
			return 0;
		headpat = 0xf0;
		length = 4;
	}

	/* Emit continuation bytes last to first, six payload bits each. */
	for (pos = length - 1; pos > 0; pos--) {
		out[pos] = 0x80 | (unisym & 0x3f);
		unisym >>= 6;
	}
	/* Whatever bits are left belong in the lead byte. */
	out[0] = headpat | unisym;

	return length;
#endif
}