15 #ifndef RAPIDJSON_INTERNAL_REGEX_H_
16 #define RAPIDJSON_INTERNAL_REGEX_H_
18 #include "../allocators.h"
19 #include "../stream.h"
24 RAPIDJSON_DIAG_OFF(padded)
25 RAPIDJSON_DIAG_OFF(
switch-
enum)
26 RAPIDJSON_DIAG_OFF(implicit-fallthrough)
31 RAPIDJSON_DIAG_OFF(effc++)
33 RAPIDJSON_DIAG_OFF(implicit-fallthrough)
39 RAPIDJSON_DIAG_OFF(4512)
42 #ifndef RAPIDJSON_REGEX_VERBOSE
43 #define RAPIDJSON_REGEX_VERBOSE 0
46 RAPIDJSON_NAMESPACE_BEGIN
87 template <
typename Encoding,
typename Allocator = CrtAllocator>
90 typedef typename Encoding::Ch Ch;
92 GenericRegex(
const Ch* source,
Allocator* allocator = 0) :
93 states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
94 stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
96 GenericStringStream<Encoding> ss(source);
97 DecodedStream<GenericStringStream<Encoding> > ds(ss);
102 Allocator::Free(stateSet_);
105 bool IsValid()
const {
106 return root_ != kRegexInvalidState;
109 template <
typename InputStream>
110 bool Match(InputStream& is)
const {
111 return SearchWithAnchoring(is,
true,
true);
114 bool Match(
const Ch* s)
const {
115 GenericStringStream<Encoding> is(s);
119 template <
typename InputStream>
120 bool Search(InputStream& is)
const {
121 return SearchWithAnchoring(is, anchorBegin_, anchorEnd_);
124 bool Search(
const Ch* s)
const {
125 GenericStringStream<Encoding> is(s);
139 static const unsigned kAnyCharacterClass = 0xFFFFFFFF;
140 static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
141 static const unsigned kRangeNegationFlag = 0x80000000;
163 template <
typename SourceStream>
164 class DecodedStream {
166 DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
167 unsigned Peek() {
return codepoint_; }
169 unsigned c = codepoint_;
177 if (!Encoding::Decode(ss_, &codepoint_))
187 return states_.template Bottom<State>()[index];
190 const State& GetState(
SizeType index)
const {
192 return states_.template Bottom<State>()[index];
197 return ranges_.template Bottom<Range>()[index];
200 const Range& GetRange(
SizeType index)
const {
202 return ranges_.template Bottom<Range>()[index];
205 template <
typename InputStream>
206 void Parse(DecodedStream<InputStream>& ds) {
208 Stack<Allocator> operandStack(&allocator, 256);
209 Stack<Allocator> operatorStack(&allocator, 256);
210 Stack<Allocator> atomCountStack(&allocator, 256);
212 *atomCountStack.template Push<unsigned>() = 0;
215 while (ds.Peek() != 0) {
216 switch (codepoint = ds.Take()) {
226 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
227 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
229 *operatorStack.template Push<Operator>() = kAlternation;
230 *atomCountStack.template Top<unsigned>() = 0;
234 *operatorStack.template Push<Operator>() = kLeftParenthesis;
235 *atomCountStack.template Push<unsigned>() = 0;
239 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
240 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
242 if (operatorStack.Empty())
244 operatorStack.template Pop<Operator>(1);
245 atomCountStack.template Pop<unsigned>(1);
246 ImplicitConcatenation(atomCountStack, operatorStack);
250 if (!Eval(operandStack, kZeroOrOne))
255 if (!Eval(operandStack, kZeroOrMore))
260 if (!Eval(operandStack, kOneOrMore))
267 if (!ParseUnsigned(ds, &n))
270 if (ds.Peek() ==
',') {
272 if (ds.Peek() ==
'}')
273 m = kInfinityQuantifier;
274 else if (!ParseUnsigned(ds, &m) || m < n)
280 if (!EvalQuantifier(operandStack, n, m) || ds.Peek() !=
'}')
287 PushOperand(operandStack, kAnyCharacterClass);
288 ImplicitConcatenation(atomCountStack, operatorStack);
294 if (!ParseRange(ds, &range))
296 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
297 GetState(s).rangeStart = range;
298 *operandStack.template Push<Frag>() = Frag(s, s, s);
300 ImplicitConcatenation(atomCountStack, operatorStack);
304 if (!CharacterEscape(ds, &codepoint))
309 PushOperand(operandStack, codepoint);
310 ImplicitConcatenation(atomCountStack, operatorStack);
314 while (!operatorStack.Empty())
315 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
319 if (operandStack.GetSize() ==
sizeof(Frag)) {
320 Frag* e = operandStack.template Pop<Frag>(1);
321 Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
324 #if RAPIDJSON_REGEX_VERBOSE
325 printf(
"root: %d\n", root_);
326 for (
SizeType i = 0; i < stateCount_ ; i++) {
327 State& s = GetState(i);
328 printf(
"[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (
char)s.codepoint);
336 if (stateCount_ > 0) {
337 stateSet_ =
static_cast<unsigned*
>(states_.GetAllocator().Malloc(GetStateSetSize()));
338 state0_.template Reserve<SizeType>(stateCount_);
339 state1_.template Reserve<SizeType>(stateCount_);
344 State* s = states_.template Push<State>();
347 s->codepoint = codepoint;
348 s->rangeStart = kRegexInvalidRange;
349 return stateCount_++;
352 void PushOperand(Stack<Allocator>& operandStack,
unsigned codepoint) {
353 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
354 *operandStack.template Push<Frag>() = Frag(s, s, s);
357 void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
358 if (*atomCountStack.template Top<unsigned>())
359 *operatorStack.template Push<Operator>() = kConcatenation;
360 (*atomCountStack.template Top<unsigned>())++;
365 while (GetState(l1).out != kRegexInvalidState)
366 l1 = GetState(l1).out;
367 GetState(l1).out = l2;
372 for (
SizeType next; l != kRegexInvalidState; l = next) {
373 next = GetState(l).out;
378 bool Eval(Stack<Allocator>& operandStack, Operator op) {
383 Frag e2 = *operandStack.template Pop<Frag>(1);
384 Frag e1 = *operandStack.template Pop<Frag>(1);
385 Patch(e1.out, e2.start);
386 *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
391 if (operandStack.GetSize() >=
sizeof(Frag) * 2) {
392 Frag e2 = *operandStack.template Pop<Frag>(1);
393 Frag e1 = *operandStack.template Pop<Frag>(1);
394 SizeType s = NewState(e1.start, e2.start, 0);
395 *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
401 if (operandStack.GetSize() >=
sizeof(Frag)) {
402 Frag e = *operandStack.template Pop<Frag>(1);
403 SizeType s = NewState(kRegexInvalidState, e.start, 0);
404 *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
410 if (operandStack.GetSize() >=
sizeof(Frag)) {
411 Frag e = *operandStack.template Pop<Frag>(1);
412 SizeType s = NewState(kRegexInvalidState, e.start, 0);
414 *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
421 if (operandStack.GetSize() >=
sizeof(Frag)) {
422 Frag e = *operandStack.template Pop<Frag>(1);
423 SizeType s = NewState(kRegexInvalidState, e.start, 0);
425 *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
432 bool EvalQuantifier(Stack<Allocator>& operandStack,
unsigned n,
unsigned m) {
439 else if (m == kInfinityQuantifier)
440 Eval(operandStack, kZeroOrMore);
442 Eval(operandStack, kZeroOrOne);
443 for (
unsigned i = 0; i < m - 1; i++)
444 CloneTopOperand(operandStack);
445 for (
unsigned i = 0; i < m - 1; i++)
446 Eval(operandStack, kConcatenation);
451 for (
unsigned i = 0; i < n - 1; i++)
452 CloneTopOperand(operandStack);
454 if (m == kInfinityQuantifier)
455 Eval(operandStack, kOneOrMore);
457 CloneTopOperand(operandStack);
458 Eval(operandStack, kZeroOrOne);
459 for (
unsigned i = n; i < m - 1; i++)
460 CloneTopOperand(operandStack);
461 for (
unsigned i = n; i < m; i++)
462 Eval(operandStack, kConcatenation);
465 for (
unsigned i = 0; i < n - 1; i++)
466 Eval(operandStack, kConcatenation);
473 void CloneTopOperand(Stack<Allocator>& operandStack) {
474 const Frag src = *operandStack.template Top<Frag>();
475 SizeType count = stateCount_ - src.minIndex;
476 State* s = states_.template Push<State>(count);
477 memcpy(s, &GetState(src.minIndex), count *
sizeof(State));
478 for (
SizeType j = 0; j < count; j++) {
479 if (s[j].out != kRegexInvalidState)
481 if (s[j].out1 != kRegexInvalidState)
484 *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
485 stateCount_ += count;
488 template <
typename InputStream>
489 bool ParseUnsigned(DecodedStream<InputStream>& ds,
unsigned* u) {
491 if (ds.Peek() <
'0' || ds.Peek() >
'9')
493 while (ds.Peek() >=
'0' && ds.Peek() <=
'9') {
494 if (r >= 429496729 && ds.Peek() >
'5')
496 r = r * 10 + (ds.Take() -
'0');
502 template <
typename InputStream>
503 bool ParseRange(DecodedStream<InputStream>& ds,
SizeType* range) {
507 SizeType start = kRegexInvalidRange;
508 SizeType current = kRegexInvalidRange;
510 while ((codepoint = ds.Take()) != 0) {
513 if (codepoint ==
'^') {
521 if (start == kRegexInvalidRange)
526 GetRange(current).next = r;
529 GetRange(start).start |= kRangeNegationFlag;
534 if (ds.Peek() ==
'b') {
538 else if (!CharacterEscape(ds, &codepoint))
545 if (codepoint ==
'-') {
554 if (current != kRegexInvalidRange)
555 GetRange(current).next = r;
556 if (start == kRegexInvalidRange)
565 GetRange(current).end = codepoint;
573 SizeType NewRange(
unsigned codepoint) {
574 Range* r = ranges_.template Push<Range>();
575 r->start = r->end = codepoint;
576 r->next = kRegexInvalidRange;
577 return rangeCount_++;
580 template <
typename InputStream>
581 bool CharacterEscape(DecodedStream<InputStream>& ds,
unsigned* escapedCodepoint) {
583 switch (codepoint = ds.Take()) {
598 *escapedCodepoint = codepoint;
return true;
599 case 'f': *escapedCodepoint = 0x000C;
return true;
600 case 'n': *escapedCodepoint = 0x000A;
return true;
601 case 'r': *escapedCodepoint = 0x000D;
return true;
602 case 't': *escapedCodepoint = 0x0009;
return true;
603 case 'v': *escapedCodepoint = 0x000B;
return true;
609 template <
typename InputStream>
610 bool SearchWithAnchoring(InputStream& is,
bool anchorBegin,
bool anchorEnd)
const {
612 DecodedStream<InputStream> ds(is);
615 Stack<Allocator> *current = &state0_, *next = &state1_;
616 const size_t stateSetSize = GetStateSetSize();
617 std::memset(stateSet_, 0, stateSetSize);
619 bool matched = AddState(*current, root_);
621 while (!current->Empty() && (codepoint = ds.Take()) != 0) {
622 std::memset(stateSet_, 0, stateSetSize);
625 for (
const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
626 const State& sr = GetState(*s);
627 if (sr.codepoint == codepoint ||
628 sr.codepoint == kAnyCharacterClass ||
629 (sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
631 matched = AddState(*next, sr.out) || matched;
632 if (!anchorEnd && matched)
636 AddState(*next, root_);
638 internal::Swap(current, next);
644 size_t GetStateSetSize()
const {
645 return (stateCount_ + 31) / 32 * 4;
649 bool AddState(Stack<Allocator>& l,
SizeType index)
const {
652 const State& s = GetState(index);
653 if (s.out1 != kRegexInvalidState) {
654 bool matched = AddState(l, s.out);
655 return AddState(l, s.out1) || matched;
657 else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
658 stateSet_[index >> 5] |= (1 << (index & 31));
659 *l.template PushUnsafe<SizeType>() = index;
661 return s.out == kRegexInvalidState;
664 bool MatchRange(
SizeType rangeIndex,
unsigned codepoint)
const {
665 bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0;
666 while (rangeIndex != kRegexInvalidRange) {
667 const Range& r = GetRange(rangeIndex);
668 if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end)
675 Stack<Allocator> states_;
676 Stack<Allocator> ranges_;
681 static const unsigned kInfinityQuantifier = ~0u;
685 mutable Stack<Allocator> state0_;
686 mutable Stack<Allocator> state1_;
691 typedef GenericRegex<UTF8<> > Regex;
694 RAPIDJSON_NAMESPACE_END
704 #endif // RAPIDJSON_INTERNAL_REGEX_H_