[pgsql-jp: 37893] Re: lc_collat=Cだとto_tsqueryに日本語が使えない
Tatsuo Ishii
ishii @ sraoss.co.jp
2007年 1月 1日 (月) 23:32:08 JST
石井です.
> > こんにちは、ヨコロッパと申します
> >
> > PostgreSQL 8.2 + GIN + tsearch2 + mecab で
> > 日本語の全文検索にチャレンジしています。
> >
> > はじめは順調にできたのですが、日本語のソートをするために
> > --no-locale付きでinitdb しなおしたところ
> > to_tsquery() に日本語が使えなくなってしまいました
> >
> > なにか心当たりのある方いらっしゃいますでしょうか
>
> tsearch2はCロケールでは日本語が使えません.これはtsearch2がwcstombsな
> どのワイド文字関数にずぶずぶに依存した実装になっているからです.なんと
> か直そうとは思い,努力しているところです:-)
パッチを作ってみました.テストに協力してくださる方募集します.
CVS head用のパッチですが,8.2.0にも問題なくあたるようです.
contrib/tsearch2の下で,
patch -p0 -b < パッチファイル
としてあててから,tsearch2を make clean; make install してください.
tsearch2.sqlを再実行する必要はありません.
----------------------- cut here ---------------------------------
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
--- ts_locale.c 1 Jan 2007 12:22:50 -0000
***************
*** 63,68 ****
--- 63,101 ----
return mbstowcs(to, from, len);
}
+
+ #else /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ wchar_t *result;
+ size_t n;
+
+ if (to == NULL)
+ return 0;
+
+ if (lc_ctype_is_c)
+ {
+ /* allocate necessary memory for "to", including the NUL terminator */
+ result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+
+ /* do the conversion */
+ n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ if (n > 0)
+ {
+ /* store the result */
+ if (n > len)
+ n = len;
+ memcpy(to, result, n*sizeof(wchar_t));
+ pfree(result);
+ *(to + n) = '\0';
+ }
+ return n;
+ }
+ return mbstowcs(to, from, len);
+ }
+
#endif /* WIN32 */
int
***************
*** 70,75 ****
--- 103,113 ----
{
wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isalpha(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
{
wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isprint(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 195,201 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
--- ts_locale.h 1 Jan 2007 12:22:50 -0000
***************
*** 38,45 ****
#else /* WIN32 */
/* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
#else /* WIN32 */
/* correct mbstowcs */
#define wchar2char wcstombs
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
#define COPYCHAR(d,s) do { \
int lll = pg_mblen( s ); \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
--- wordparser/parser.c 1 Jan 2007 12:22:51 -0000
***************
*** 44,52 ****
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
+ *
+ * This is a wrong assumption: even if the locale is C, multibyte processing is necessary.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
--- 94,102 ----
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? \
! (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
***************
*** 134,141 ****
}
#endif /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 138,197 ----
}
#endif /* TS_USE_WIDE */
! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
----------------------- cut here ---------------------------------
pgsql-jp メーリングリストの案内