Functions.cpp

/* */
This source file includes the following definitions.
config_load
exp_tokenizer
corpus_tokenizer
emphword
checkfno
calckind
checkquery
escapetag
ssclear
term
   1 /* -*- coding: utf-8; mode: c++; -*-
   2  * Concordance to A. S. Pushkin's Works - Functions
   3  * $Id: Functions.cpp 21 2013-11-04 18:27:11Z isao $
   4  * Copyright (C) 2012, isao yasuda
   5  */
   6 
   7 #include <boost/tokenizer.hpp>
   8 #include <boost/regex.hpp>
   9 #include <boost/regex/icu.hpp>
  10 #include <iostream>
  11 #include <fstream>
  12 #include <sstream>
  13 #include <cstdlib>
  14 #include <csignal>
  15 #include "Query.hpp"
  16 
   17 typedef std::pair<std::string, std::string> string_type; // (key, value) pair used when filling the configuration map
   18 
   19 // Markup strings inserted to emphasise hits
   20 std::string lopen = "<span class=\"hl\">"; // opens the highlight of a hit line
   21 std::string lclos = "</span>";             // closes the highlight of a hit line
   22 std::string eopen = "<em class=\"hit\">";  // opens the highlight of a hit word
   23 std::string eclos = "</em>";               // closes the highlight of a hit word
   24 // Maximum distance accepted for the proximity/adjacency operators
   25 int MAXDIST = 1000;
  26 
  27 // 構成ファイルロード: Key="Mapped" から Key と Mapped を切り出し map 登録
  28 void config_load(const char* fn, std::map<std::string, std::string>& cm)
  29 {
  30     boost::regex  r1("(^\\s*#.*|^\\s*$)");
  31     boost::regex  r2("([\\w0-9]+)[^=]*=[^\\w0-9\"]*\"?([^\"]*)\"?");
  32     boost::smatch m;
  33     try {
  34         std::string   buf;
  35         std::ifstream ifs(fn);
  36         while (ifs && std::getline(ifs, buf)) {
  37             // コメント # 行または空行は読み飛ばす
  38             if (boost::regex_match(buf, m, r1))
  39                 continue;
  40             // パターンマッチするものを map に格納
  41             if (boost::regex_match(buf, m, r2)) {
  42                 string_type value(m.str(1), m.str(2));
  43                 cm.insert(value);
  44             } else {
  45                 std::cerr << "Unavailable line: " << buf << "\n";
  46             }
  47         }
  48     } catch (std::exception& e) {
  49         std::cerr << "Configuration parse error: " << e.what() << "\n";
  50     }
  51 }
  52 
  53 // ユーザ入力式分割
  54 void exp_tokenizer(std::string& line, std::vector<std::string>& sv)
  55 {
  56     typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  57     // 正規表現メタ文字，<> (近接隣接演算子) では分割しない
  58     boost::char_separator<char> sep("\t _:;!~`\"&%@/",
  59                                     "", boost::drop_empty_tokens);
  60     tokenizer tok(line, sep);
  61     for(tokenizer::iterator ti = tok.begin(); ti != tok.end(); ti++)
  62         sv.push_back(*ti);
  63 }
  64 
  65 // コーパス・トークナイザ: corpus 行を単語分割して，vector に格納
  66 // 8 bit の区切り文字 «, » などには未対応 (corpus 作成プログラムで変換しておく)
  67 void corpus_tokenizer(std::string& line, std::vector<std::string>& sv)
  68 {
  69     typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  70     // 区切り文字定義: -' では分割しない
  71     boost::char_separator<char> sep("\t _:;.,?!~`\"\\[]{}()*&^%$#@+=|<>/",
  72                                     "", boost::drop_empty_tokens);
  73     tokenizer tok(line, sep);
  74     for(tokenizer::iterator ti = tok.begin(); ti != tok.end(); ti++)
  75         sv.push_back(*ti);
  76 }
  77 
  78 // ヒット行／単語テクスト強調装飾
  79 // - l0: current line; l1: pre line; l2: after line;
  80 // - o1: context pre line; o2: context hit word and after line;
  81 // - pos: hit word position address
  82 void emphword(std::string& l0, std::string& l1, std::string& l2,
  83               std::string& o1, std::string& o2, int pos,
  84               int cprelen, int caftlen)
  85 {
  86     // 実体参照 &lt;(<),&gt;(>),&quot;(") を一時的に記号に置換する
  87     std::string bufo(l0);
  88     boost::regex rp1("&lt;");
  89     boost::regex rp2("&gt;");
  90     boost::regex rp3("&quot;");
  91     bufo = boost::regex_replace(bufo, rp1, "<");
  92     bufo = boost::regex_replace(bufo, rp2, ">");
  93     bufo = boost::regex_replace(bufo, rp3, "\"");
  94 
  95     // current (単語ヒット) 行編集バッファ
  96     char* buf = new char[bufo.size()+1];
  97     std::strcpy(buf, bufo.c_str());
  98 
  99     // 正規表現 iterator で単語マッチを pos まで繰り返し，単語と位置を取得する
 100     // §†¼½⅓№ 以外に単語として扱うべき約物が corpus にないことを確認しておくこと
 101     boost::u32regex r(boost::make_u32regex("([§†¼½⅓№0-9\\-'\\w]+)"));
 102     boost::utf8regex_iterator i(boost::make_u32regex_iterator(buf, r));
 103     boost::utf8regex_iterator j;
 104     int k = 0;      // 行頭からの語数
 105     std::string ws; // 強調すべき語
 106     int wpt;        // その位置
 107     while (i !=j) {
 108         if (k++ == pos) {
 109             ws = (*i)[0];          // pos 番目の単語を取得
 110             wpt = (*i).position(); // pos 番目の語位置を取得
 111             break;
 112         }
 113         i++;
 114     }
 115     std::string s0 = bufo.substr(0, wpt);          // ヒット語前テキスト
 116     std::string s1 = bufo.substr(wpt + ws.size()); // ヒット語後テキスト
 117 
 118     // 指定文字数でコンテクスト長を制限
 119     icu::UnicodeString uspre = icu::UnicodeString::fromUTF8(s0);
 120     icu::UnicodeString usaft = icu::UnicodeString::fromUTF8(s1);
 121     icu::UnicodeString usl1  = icu::UnicodeString::fromUTF8(l1);
 122     icu::UnicodeString usl2  = icu::UnicodeString::fromUTF8(l2);
 123     icu::UnicodeString usws  = icu::UnicodeString::fromUTF8(ws);
 124 
 125     // 前テキスト編集
 126     if (cprelen > 0) {
 127         int s0len = uspre.length(); // 前テキスト文字数
 128         if (s0len > cprelen) {
 129             uspre = uspre.tempSubString((s0len - cprelen), cprelen);
 130             uspre = ".." + uspre;
 131             usl1 = "";
 132         } else {
 133             if ((usl1.length() + s0len) > cprelen) {
 134                 usl1 = usl1.tempSubString
 135                        ((usl1.length() - (cprelen - s0len)), (cprelen - s0len));
 136                 usl1 = ".." + usl1;
 137             }
 138         }
 139     }
 140     // 後テキスト編集
 141     if (caftlen > 0) {
 142         int s1len = usaft.length(); // 後テキスト文字数
 143         int aftmax = caftlen - usws.length(); // ヒット語を除く最大後テキスト文字数
 144         if (s1len > aftmax) {
 145             usaft = usaft.tempSubString(0, aftmax) + "..";
 146             usl2 = "";
 147         } else {
 148             if ((s1len + usl2.length()) > aftmax) {
 149                 usl2 = usl2.tempSubString(0, (aftmax - s1len)) + "..";
 150             }
 151         }
 152     }
 153     std::string s0t;
 154     std::string s1t;
 155     std::string l1t;
 156     std::string l2t;
 157     s0 = uspre.toUTF8String(s0t);
 158     s1 = usaft.toUTF8String(s1t);
 159     l1 = usl1.toUTF8String(l1t);
 160     l2 = usl2.toUTF8String(l2t);
 161 
 162     // <,>," を実体参照に戻す
 163     boost::regex ro1("<");
 164     boost::regex ro2(">");
 165     boost::regex ro3("\"");
 166     s0 = boost::regex_replace(s0, ro1, "&lt;");
 167     s0 = boost::regex_replace(s0, ro2, "&gt;");
 168     s0 = boost::regex_replace(s0, ro3, "&quot;");
 169     s1 = boost::regex_replace(s1, ro1, "&lt;");
 170     s1 = boost::regex_replace(s1, ro2, "&gt;");
 171     s1 = boost::regex_replace(s1, ro3, "&quot;");
 172 
 173     // 強調装飾を施し，pre l1 - cur l0 - aft l2 を連結し，l3 元文字列に上書き
 174     if (pos) {
 175         // ヒット語が cur 行先頭以外の場合
 176         o1 = l1 + " " + lopen + s0 + lclos;
 177         o2 = eopen + ws + eclos + lopen + s1 + lclos + " " + l2;
 178     } else {
 179         // ヒット語が cur 行先頭の場合
 180         o1 = l1 + " " + lopen + s0 + lclos;
 181         o2 = lopen + eopen + ws + eclos + s1 + lclos + " " + l2;
 182     }
 183 
 184     // 動的確保文字列領域を解放
 185     delete [] buf;
 186 }
 187 
 188 // ファイル番号同値チェック: Corpus ファイル番号を取り出し，同一かチェックする
 189 bool checkfno(const char* cs, int fno)
 190 {
 191     std::string s(cs);
 192     std::string sn;
 193 
 194     // 一個目と二個目のコロンに挟まれた数値文字列を取得
 195     std::string::size_type j = s.find(':');
 196     std::string::size_type i = ++j;
 197     j = s.find(':', j);
 198     s = s.substr(i, j-i);
 199 
 200     // 与えられた fno と比較し一致したら真
 201     if ((std::atoi(s.c_str())) == fno)
 202         return true;   // 同一
 203     else
 204         return false;  // 不一致
 205 }
 206 
 207 // 完全一致／正規表現演算種別判別
 208 // - 完全一致／正規表現演算の種別を判定し，コール元の kind に種別をセットする
 209 // - ".*" 等の記号のみからなる式の場合エラーとする
 210 // - ジャンル指定により 1 文字しか含まれないものをエラーとする
 211 int calckind(std::string& s, std::vector<int>& gv, int& kind)
 212 {
 213     // 正規表現かどうかチェックするための正規表現
 214     boost::u32regex r(boost::make_u32regex("[\\-+*^$.|,?()\\[\\]{}\\\\]"));
 215     // 記号だけパターン
 216     boost::u32regex r1(boost::make_u32regex("^\\W+$"));
 217     // 通常文字 2 文字含むパターン
 218     boost::u32regex r2(boost::make_u32regex("(.*\\w.*\\w.*|^\\w+$)"));
 219     // マッチ結果
 220     boost::smatch   m, m1;
 221 
 222     // 記号だけ
 223     if (boost::u32regex_match(s, m, r1))
 224         return 1;     // 文字種エラー
 225 
 226     // ジャンル選択によって 2 文字以内のもの
 227     if ((gv.size() == 0) || (gv.size() > 7)) {
 228         if (! boost::u32regex_match(s, m1, r2)) {
 229             return 2; // 文字数エラー
 230         }
 231     }
 232 
 233     // 演算種別セット
 234     if (boost::u32regex_search(s, m, r))
 235         kind = 1;     // 正規表現探索
 236     else
 237         kind = 0;     // 完全一致探索
 238 
 239     return 0;         // 正常終了
 240 }
 241 
 242 // クエリチェック: 検査結果によりリターンコードをセット
 243 // - 正常: 0
 244 // - 記号(正規表現)だけのもの(大量ヒット抑止): 1 (calckind 使用)
 245 // - все 選択時，もしくは 8 ジャンル以上選択時 1 文字の正規表現: 2 (calckind 使用)
 246 // - 近接隣接演算種別 W|L 以外: 3
 247 // - 近接隣接演算距離 MAXDIST 以上もしくは 0: 4
 248 // - 近接隣接演算フォーマット <> 含むのに前後ワードが切り出せない: 5
 249 int checkquery(std::string& s, std::vector<int>& gv, std::vector<query>& qv)
 250 {
 251     // クエリ
 252     query q;    // クエリ構造体
 253     q.qsrc = s; // 入力式そのものをセット
 254     int rt;     // return code
 255     // 近接隣接演算オペレータ／オペランド分割用正規表現
 256     boost::u32regex r3(boost::make_u32regex("([^<]+)<([LW])([0-9]+)>(.+)"));
 257     // 近接隣接演算チェック用
 258     boost::regex    r4("[<>]"); // < | > 含むかをチェックする
 259     boost::regex    r5("<.*");  // < より右の部分削除用(boost.regex バグ対応)
 260     boost::smatch   m;          // マッチ結果
 261 
 262     // 近接隣接隣接演算チェック
 263     if (boost::regex_search(s, m, r4)) {
 264         if (boost::u32regex_match(s.c_str(), m, r3)) {
 265             // 検査ワード，演算子，近接隣接ワードを分解
 266             std::string wm[4];
 267             wm[1] = m[2].str(); // 近接隣接演算種別 L(行) or W(語)
 268             wm[2] = m[3].str(); // 近接隣接演算距離
 269             wm[3] = m[4].str(); // 近接隣接演算対照ワード
 270             wm[0] = boost::regex_replace(s, r5, ""); // 近接隣接演算検査ワード
 271             // 検査ワードセット
 272             q.qexp1 = wm[0];
 273             rt = calckind(wm[0], gv, q.kind1);
 274             if (rt) return rt;  // 演算種別チェックのリターン 1 | 2
 275             // 近接隣接ワードセット
 276             q.qexp2 = wm[3];
 277             rt = calckind(wm[3], gv, q.kind2);
 278             if (rt) return rt;  // 演算種別チェックのリターン 1 | 2
 279             // 近接隣接演算種別セット
 280             if (wm[1] == "W")
 281                 q.adjk = 1;     // W をセット
 282             else if (wm[1] == "L")
 283                 q.adjk = 2; // L をセット
 284             else
 285                 return 3;   // 近接隣接演算種別エラー
 286             // 近接隣接演算距離セット
 287             int dist = std::atoi(wm[2].c_str());
 288             if ((dist > MAXDIST) || (dist == 0))
 289                 return 4;       // 近接隣接距離エラー
 290             else
 291                 q.adjd = dist;  // 距離をセット
 292         } else {
 293             return 5;           // 近接隣接演算フォーマットエラー
 294         }
 295     } else {
 296         // 非近接隣接演算
 297         rt = calckind(s, gv, q.kind1);
 298         if (rt) return rt;      // 演算種別チェックのリターン 1 | 2
 299         q.adjk  = 0;  // 近接隣接演算ではない
 300         q.adjd  = 0;  // 距離 0 をセット
 301         q.qexp1 = s;  // 探索対象ワード
 302         q.qexp2 = ""; // 使用しないエリア
 303         q.kind2 = 0;  // 使用しないエリア
 304     }
 305 
 306     // クエリを query vector に格納
 307     qv.push_back(q);
 308 
 309     return 0;                   // 正常終了
 310 }
 311 
 312 // タグ，dblquo のエスケープ
 313 // - もともと &lt; だったりすると誤動作する: corpus 生成側で対処済前提
 314 void escapetag(std::string& s)
 315 {
 316     boost::regex r0, r1, r2, r3; // 正規表現パターン
 317     r0 = boost::regex("[&]");
 318     r1 = boost::regex("[<]");
 319     r2 = boost::regex("[>]");
 320     r3 = boost::regex("[\"]");
 321     s = boost::regex_replace(s, r0, "&amp;");
 322     s = boost::regex_replace(s, r1, "&lt;");
 323     s = boost::regex_replace(s, r2, "&gt;");
 324     s = boost::regex_replace(s, r3, "&quot;");
 325 }
 326 
 327 // ストリームのクリア
 328 void ssclear(std::stringstream& ss)
 329 {
 330     static const std::string empty("");
 331     ss.str(empty);
 332     ss.clear();
 333 }
 334 
 335 // シグナル・ハンドラ: CorpusLoader, WordTreeBuilder 用
 336 void term(int n)
 337 {
 338     std::cerr << "Signal caught " << n << "\n";
 339     exit(1);
 340 }
/* */
root/Functions.cpp

DEFINITIONS