#include "unicode/regex.h"
#include "unicode/ucnv.h"

#include <cstring>
#include <vector>

// Link against the ICU import libraries that match the build flavour
// (debug/release) and the target architecture (32/64 bit).
#ifdef _DEBUG
#ifdef _WIN64
#pragma comment(lib,"icuin64d.lib")
#pragma comment(lib,"icuuc64d.lib")
#else
#pragma comment(lib,"icuin32d.lib")
#pragma comment(lib,"icuuc32d.lib")
#endif
#else
#ifdef _WIN64
#pragma comment(lib,"icuin64.lib")
#pragma comment(lib,"icuuc64.lib")
#else
#pragma comment(lib,"icuin32.lib")
#pragma comment(lib,"icuuc32.lib")
#endif
#endif

// Maximum number of input bytes converted and matched per pass (1 MiB).
// Parenthesised so the macro expands safely inside larger expressions.
#define CONTENT_SPLIT_MAX_LEN (1024*1024)

/// Counts the occurrences of USubStr inside USrcStr.
///
/// @param USrcStr  text to search.
/// @param USubStr  text to look for; an empty needle yields 0 because
///                 UnicodeString::indexOf reports "not found" for it.
/// @param index    distance to advance past the start of each hit before
///                 searching again. Pass the needle length to count
///                 non-overlapping occurrences.
/// @return number of hits found.
int FindSubNum(UnicodeString USrcStr, UnicodeString USubStr, int index)
{
    // A non-positive step would keep re-finding the same hit forever, so
    // always move forward by at least one code unit.
    const int step = (index > 0) ? index : 1;

    int32_t num = 0;
    int pos = USrcStr.indexOf(USubStr);
    while (pos != -1)
    {
        num++;
        pos = USrcStr.indexOf(USubStr, pos + step);
    }
    return num;
}

/// Picks a chunk length that does not cut a UTF-8 sequence in half.
///
/// @param data     start of the chunk; data[max_len] must be readable, i.e.
///                 more input follows the chunk.
/// @param max_len  largest chunk length allowed, in bytes.
/// @return the largest length <= max_len at which the following byte starts
///         a new sequence, or max_len itself when the input is malformed.
static int32_t Utf8ChunkLength(const char* data, int32_t max_len)
{
    int32_t len = max_len;

    // A UTF-8 sequence carries at most three continuation bytes (10xxxxxx),
    // so stepping back three times is always enough for well-formed input.
    for (int back = 0; back < 3 && len > 0; ++back)
    {
        if ((static_cast<unsigned char>(data[len]) & 0xC0) != 0x80)
        {
            break;
        }
        --len;
    }

    // Still on a continuation byte (or nothing left): the input is not valid
    // UTF-8 here, so keep the plain cut and let the converter substitute.
    if (len <= 0 || (static_cast<unsigned char>(data[len]) & 0xC0) == 0x80)
    {
        return max_len;
    }
    return len;
}

/// Counts how often `keyphrase` occurs in `buf` once every match of the
/// regular expression `pat_str` has been deleted from it.
///
/// Both `buf` and `pat_str` are decoded as UTF-8. The content is processed in
/// chunks of at most CONTENT_SPLIT_MAX_LEN bytes:
///   1. Advantage: a very large input is never converted/matched in one go,
///      which keeps memory use bounded.
///   2. Drawback: text that straddles a chunk boundary is seen in two pieces,
///      so a regex match or a key phrase crossing a boundary can be missed.
///      The number of missed hits is bounded by the number of splits.
///
/// @param buf        NUL-terminated UTF-8 content to inspect.
/// @param pat_str    NUL-terminated UTF-8 ICU regular expression.
/// @param keyphrase  phrase to count in the content left after the removal.
/// @return total number of occurrences over all chunks; 0 when an argument
///         is NULL, the converter cannot be opened, or the pattern cannot be
///         converted or compiled. A chunk that fails to convert or match is
///         skipped and contributes nothing.
extern "C" SP_DLP_DLLEXPORT int findKeyPhraseReg(char* buf, char *pat_str, UnicodeString keyphrase)
{
    if (NULL == buf || NULL == pat_str)
    {
        return 0;
    }

    const int32_t buf_len = static_cast<int32_t>(strlen(buf));
    const int32_t pat_str_len = static_cast<int32_t>(strlen(pat_str));

    // UTF-8 -> UTF-16 never produces more code units than there are input
    // bytes, so "byte count + 1" UChars always leaves room for the NUL.
    // Both buffers are allocated before any ICU object is created so that an
    // allocation failure cannot leak the converter or the pattern.
    const int32_t chunk_bytes =
        (buf_len > CONTENT_SPLIT_MAX_LEN) ? CONTENT_SPLIT_MAX_LEN : buf_len;
    std::vector<UChar> pat_units(static_cast<size_t>(pat_str_len) + 1);
    std::vector<UChar> chunk_units(static_cast<size_t>(chunk_bytes) + 1);

    UErrorCode status = U_ZERO_ERROR;
    // ucnv_toUChars resets the converter on every call, so a single
    // converter serves both the pattern and all content chunks.
    UConverter* cv = ucnv_open("utf-8", &status);
    if (U_FAILURE(status))
    {
        ucnv_close(cv);
        return 0;
    }

    // Convert the pattern to a UnicodeString. The capacity argument is a
    // count of UChars, not of bytes.
    const int32_t pat_units_len = ucnv_toUChars(cv, &pat_units[0],
                                                static_cast<int32_t>(pat_units.size()),
                                                pat_str, pat_str_len, &status);
    if (U_FAILURE(status))
    {
        ucnv_close(cv);
        return 0;
    }
    const UnicodeString patString(&pat_units[0], pat_units_len);

    // Compile the regular expression; the factory function is the usual way
    // to obtain a RegexPattern.
    RegexPattern* reg_pattern = RegexPattern::compile(patString, status);
    if (U_FAILURE(status))
    {
        delete reg_pattern;
        ucnv_close(cv);
        return 0;
    }

    const int keyphrase_len = keyphrase.length();
    int matchNum = 0;
    int32_t index = 0;
    while (index < buf_len)
    {
        // Take the next chunk, shortening it when the plain 1 MiB cut would
        // land in the middle of a multi-byte character.
        int32_t src_len = buf_len - index;
        if (src_len > CONTENT_SPLIT_MAX_LEN)
        {
            src_len = Utf8ChunkLength(buf + index, CONTENT_SPLIT_MAX_LEN);
        }

        // Chunks are independent: clear any error left by the previous one,
        // otherwise every later ICU call would return without doing work.
        status = U_ZERO_ERROR;
        const int32_t units_len = ucnv_toUChars(cv, &chunk_units[0],
                                                static_cast<int32_t>(chunk_units.size()),
                                                buf + index, src_len, &status);
        index += src_len;
        if (U_FAILURE(status))
        {
            continue;
        }

        // The matcher refers to inputString, which therefore has to stay
        // alive until the matcher is deleted.
        const UnicodeString inputString(&chunk_units[0], units_len);
        RegexMatcher* reg_matcher = reg_pattern->matcher(inputString, status);
        if (U_FAILURE(status))
        {
            delete reg_matcher;
            continue;
        }

        // Delete every regex match, then count the key phrase in the rest.
        const UnicodeString ustr = reg_matcher->replaceAll(UnicodeString(""), status);
        delete reg_matcher; // the matcher is no longer needed for this chunk
        if (U_FAILURE(status))
        {
            continue;
        }

        matchNum += FindSubNum(ustr, keyphrase, keyphrase_len);
    }

    delete reg_pattern;
    ucnv_close(cv);
    return matchNum;
}