Documentation
¶
Overview ¶
Package nfa provides a Thompson NFA (Non-deterministic Finite Automaton) implementation for regex matching.
This package implements the core Thompson NFA algorithm along with a PikeVM execution engine. The NFA is compiled from regexp/syntax.Regexp patterns and can be used for matching with full support for capturing groups (future).
Index ¶
- Variables
- func ContainsDot(re *syntax.Regexp) bool
- func ExtractCharClassRanges(re *syntax.Regexp) [][2]byte
- func HasImpossibleEndAnchor(re *syntax.Regexp) bool
- func IsBranchDispatchPattern(re *syntax.Regexp) bool
- func IsCompositeCharClassPattern(re *syntax.Regexp) bool
- func IsCompositeSequenceDFAPattern(re *syntax.Regexp) bool
- func IsPatternEndAnchored(re *syntax.Regexp) bool
- func IsPatternStartAnchored(re *syntax.Regexp) bool
- func IsSimpleCharClassPlus(re *syntax.Regexp) bool
- func PatternHasUTF8Dependence(re *syntax.Regexp) bool
- type BacktrackerState
- type BoundedBacktracker
- func (b *BoundedBacktracker) CanHandle(haystackLen int) bool
- func (b *BoundedBacktracker) IsMatch(haystack []byte) bool
- func (b *BoundedBacktracker) IsMatchAnchored(haystack []byte) bool
- func (b *BoundedBacktracker) IsMatchAnchoredWithState(haystack []byte, state *BacktrackerState) bool
- func (b *BoundedBacktracker) IsMatchWithState(haystack []byte, state *BacktrackerState) bool
- func (b *BoundedBacktracker) MaxVisitedSize() int
- func (b *BoundedBacktracker) NumStates() int
- func (b *BoundedBacktracker) Search(haystack []byte) (int, int, bool)
- func (b *BoundedBacktracker) SearchAt(haystack []byte, at int) (int, int, bool)
- func (b *BoundedBacktracker) SearchAtWithState(haystack []byte, at int, state *BacktrackerState) (int, int, bool)
- func (b *BoundedBacktracker) SearchWithState(haystack []byte, state *BacktrackerState) (int, int, bool)
- func (b *BoundedBacktracker) SetLongest(longest bool)
- type BranchDispatcher
- type BuildError
- type BuildOption
- type Builder
- func (b *Builder) AddByteRange(lo, hi byte, next StateID) StateID
- func (b *Builder) AddCapture(captureIndex uint32, isStart bool, next StateID) StateID
- func (b *Builder) AddEpsilon(next StateID) StateID
- func (b *Builder) AddFail() StateID
- func (b *Builder) AddLook(look Look, next StateID) StateID
- func (b *Builder) AddMatch() StateID
- func (b *Builder) AddQuantifierSplit(left, right StateID) StateID
- func (b *Builder) AddRuneAny(next StateID) StateID
- func (b *Builder) AddRuneAnyNotNL(next StateID) StateID
- func (b *Builder) AddSparse(transitions []Transition) StateID
- func (b *Builder) AddSplit(left, right StateID) StateID
- func (b *Builder) Build(opts ...BuildOption) (*NFA, error)
- func (b *Builder) Patch(stateID, target StateID) error
- func (b *Builder) PatchSplit(stateID StateID, left, right StateID) error
- func (b *Builder) SetStart(start StateID)deprecated
- func (b *Builder) SetStarts(anchored, unanchored StateID)
- func (b *Builder) States() int
- func (b *Builder) Validate() error
- type ByteClassSet
- type ByteClasses
- type CharClassSearcher
- func (s *CharClassSearcher) CanHandle(_ int) bool
- func (s *CharClassSearcher) Count(haystack []byte) int
- func (s *CharClassSearcher) FindAllIndices(haystack []byte, results [][2]int) [][2]int
- func (s *CharClassSearcher) IsMatch(haystack []byte) bool
- func (s *CharClassSearcher) Search(haystack []byte) (int, int, bool)
- func (s *CharClassSearcher) SearchAt(haystack []byte, at int) (int, int, bool)
- type CompileError
- type Compiler
- type CompilerConfig
- type CompositeSearcher
- type CompositeSequenceDFA
- type FirstByteSet
- type Look
- type Match
- type MatchWithCaptures
- type NFA
- func (n *NFA) ByteClasses() *ByteClasses
- func (n *NFA) CaptureCount() int
- func (n *NFA) IsAlwaysAnchored() bool
- func (n *NFA) IsAnchored() bool
- func (n *NFA) IsMatch(id StateID) bool
- func (n *NFA) IsUTF8() bool
- func (n *NFA) Iter() *StateIter
- func (n *NFA) PatternCount() int
- func (n *NFA) Start() StateIDdeprecated
- func (n *NFA) StartAnchored() StateID
- func (n *NFA) StartUnanchored() StateID
- func (n *NFA) State(id StateID) *State
- func (n *NFA) States() int
- func (n *NFA) String() string
- func (n *NFA) SubexpNames() []string
- type PikeVM
- func (p *PikeVM) InitState(state *PikeVMState)
- func (p *PikeVM) IsMatch(haystack []byte) bool
- func (p *PikeVM) NumStates() int
- func (p *PikeVM) Search(haystack []byte) (int, int, bool)
- func (p *PikeVM) SearchAll(haystack []byte) []Match
- func (p *PikeVM) SearchAt(haystack []byte, at int) (int, int, bool)
- func (p *PikeVM) SearchBetween(haystack []byte, startAt, maxEnd int) (int, int, bool)
- func (p *PikeVM) SearchWithCaptures(haystack []byte) *MatchWithCaptures
- func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCaptures
- func (p *PikeVM) SetLongest(longest bool)
- type PikeVMState
- type State
- func (s *State) ByteRange() (lo, hi byte, next StateID)
- func (s *State) Capture() (index uint32, isStart bool, next StateID)
- func (s *State) Epsilon() StateID
- func (s *State) ID() StateID
- func (s *State) IsMatch() bool
- func (s *State) IsQuantifierSplit() bool
- func (s *State) Kind() StateKind
- func (s *State) Look() (Look, StateID)
- func (s *State) RuneAny() StateID
- func (s *State) RuneAnyNotNL() StateID
- func (s *State) Split() (left, right StateID)
- func (s *State) String() string
- func (s *State) Transitions() []Transition
- type StateID
- type StateIter
- type StateKind
- type Transition
Constants ¶
This section is empty.
Variables ¶
var ( // ErrInvalidState indicates an invalid NFA state ID was encountered ErrInvalidState = errors.New("invalid NFA state") // ErrInvalidPattern indicates the regex pattern is invalid or unsupported ErrInvalidPattern = errors.New("invalid regex pattern") // ErrTooComplex indicates the pattern is too complex to compile ErrTooComplex = errors.New("pattern too complex") // ErrCompilation indicates a general NFA compilation failure ErrCompilation = errors.New("NFA compilation failed") // ErrInvalidConfig indicates invalid configuration was provided ErrInvalidConfig = errors.New("invalid NFA configuration") // ErrNoMatch indicates no match was found (not an error, used internally) ErrNoMatch = errors.New("no match found") )
Common NFA errors
Functions ¶
func ContainsDot ¶ added in v0.11.0
ContainsDot checks if the regex pattern contains '.' (any char or any char except newline). When true, the pattern will benefit from ASCII-only NFA compilation for ASCII-only inputs.
This is used for ASCII runtime detection optimization (V11-002): patterns with '.' create ~28 NFA states for UTF-8 handling, but only 1-2 states in ASCII mode. Runtime detection allows using the faster ASCII NFA when input is ASCII.
func ExtractCharClassRanges ¶ added in v0.8.21
ExtractCharClassRanges extracts byte ranges from a simple char_class+ pattern AST. Returns nil if the pattern is not a simple char_class+ pattern.
Simple char_class+ patterns are:
- [a-z]+, [A-Z]+, [0-9]+ (single range)
- [\w]+, [\d]+, [\s]+ (predefined classes)
- [a-zA-Z0-9_]+ (multiple ranges)
NOT supported:
- char_class* patterns (zero-width matches not handled by CharClassSearcher)
- Patterns with anchors (^, $)
- Patterns with alternation outside char class
- Patterns with concatenation (abc[\w]+)
- Unicode char classes (need rune handling)
func HasImpossibleEndAnchor ¶ added in v0.10.9
HasImpossibleEndAnchor checks if the pattern contains an end anchor ($, \z) that is NOT at the end of the pattern. Such patterns like `$00` can never match because nothing can follow the end-of-string assertion.
Examples:
- `$00` → true ($ followed by "00" - impossible)
- `00$` → false ($ at end - valid end-anchored pattern)
- `a$b` → true ($ followed by "b" - impossible)
- `(a$)` → false ($ at end of group - valid)
func IsBranchDispatchPattern ¶ added in v0.10.4
IsBranchDispatchPattern checks if pattern is suitable for branch dispatch. Pattern must be start-anchored alternation with distinct first bytes per branch.
func IsCompositeCharClassPattern ¶ added in v0.10.4
IsCompositeCharClassPattern returns true if the pattern is a valid composite char class pattern. A composite pattern is a concatenation of 2+ quantified character classes like [a-zA-Z]+[0-9]+.
Requirements:
- Must be OpConcat
- Each sub-pattern must be a quantified char class (OpPlus, OpStar, OpQuest, OpRepeat)
- At least 2 parts
- No anchors, no captures, no alternations
func IsCompositeSequenceDFAPattern ¶ added in v0.10.6
IsCompositeSequenceDFAPattern checks if a pattern is suitable for CompositeSequenceDFA.
func IsPatternEndAnchored ¶ added in v0.4.0
IsPatternEndAnchored checks if a pattern is inherently anchored at end (ends with \z).
A pattern is end-anchored if it ends with:
- OpEndText (\z or non-multiline $) - only matches at EOF
Note: OpEndLine (multiline $ with (?m)) is NOT considered end-anchored because it can match at multiple positions (before each \n and at EOF). Using reverse search for multiline $ would miss matches before \n characters.
This is used to select the ReverseAnchored strategy which searches backward from the end of haystack for O(m) instead of O(n*m) performance.
IMPORTANT: This also checks for internal end anchors (like in `(a$)b$`). If there's an end anchor that's NOT at the very end, ReverseAnchored won't work.
func IsPatternStartAnchored ¶ added in v0.8.11
IsPatternStartAnchored checks if ANY branch of the pattern starts with ^ or \A. This is used to prevent UseReverseAnchored strategy selection for patterns like `^a?$|^b?$` where the start anchor constrains matching positions.
Unlike IsPatternEndAnchored which requires ALL branches to be end-anchored, this function returns true if ANY branch has a start anchor, because reverse search cannot properly handle partial start anchoring in alternations.
func IsSimpleCharClassPlus ¶ added in v0.8.21
IsSimpleCharClassPlus returns true if the pattern is a simple char_class+ pattern that can use CharClassSearcher for optimized matching.
func PatternHasUTF8Dependence ¶ added in v0.11.0
PatternHasUTF8Dependence checks if the pattern has constructs that depend on UTF-8 handling beyond '.' (such as Unicode character classes, word boundaries, etc).
When this returns true, ASCII optimization may not be valid for all cases. Currently checks for:
- Unicode character classes (\p{...}, \P{...})
- Word boundaries (\b, \B) - these depend on Unicode word chars in Go regexp
Note: This is conservative - patterns that ONLY use '.' and ASCII literals can safely use ASCII optimization.
Types ¶
type BacktrackerState ¶ added in v0.10.4
type BacktrackerState struct {
// Visited stores generation numbers for (state, position) pairs.
// Layout: Visited[state * (InputLen+1) + pos] = generation when visited.
// Using generation counter enables O(1) reset instead of O(n) clearing.
Visited []uint32
// Generation is incremented for each new search attempt.
// A position is considered visited if Visited[idx] == Generation.
Generation uint32
// InputLen is cached for index calculations
InputLen int
// Longest enables leftmost-longest match semantics (POSIX/AWK compatibility).
// When true, explores all branches to find the longest match instead of
// returning on the first match found.
Longest bool
}
BacktrackerState holds mutable per-search state for BoundedBacktracker. This struct should be pooled (via sync.Pool) for concurrent usage. Each goroutine must use its own BacktrackerState instance.
func NewBacktrackerState ¶ added in v0.10.4
func NewBacktrackerState() *BacktrackerState
NewBacktrackerState creates a new mutable state for use with BoundedBacktracker. This should be pooled via sync.Pool for concurrent usage.
type BoundedBacktracker ¶ added in v0.8.17
type BoundedBacktracker struct {
// contains filtered or unexported fields
}
BoundedBacktracker implements a bounded backtracking regex matcher. It uses a generation-based visited tracking for (state, position) pairs, providing O(1) reset between search attempts instead of O(n) clearing.
This engine is selected when:
- len(haystack) * nfa.States() <= maxVisitedSize (default 256KB)
- No prefilter is available (no good literals)
- Pattern doesn't benefit from DFA (simple character classes)
BoundedBacktracker is 2-5x faster than PikeVM for patterns like \d+, \w+, [a-z]+.
Thread safety: BoundedBacktracker config is immutable after creation. Use BacktrackerState for per-search mutable state to enable concurrent usage.
func NewBoundedBacktracker ¶ added in v0.8.17
func NewBoundedBacktracker(nfa *NFA) *BoundedBacktracker
NewBoundedBacktracker creates a new bounded backtracker for the given NFA.
func (*BoundedBacktracker) CanHandle ¶ added in v0.8.17
func (b *BoundedBacktracker) CanHandle(haystackLen int) bool
CanHandle returns true if this engine can handle the given input size. Returns false if the visited array would exceed maxVisitedSize entries.
func (*BoundedBacktracker) IsMatch ¶ added in v0.8.17
func (b *BoundedBacktracker) IsMatch(haystack []byte) bool
IsMatch returns true if the pattern matches anywhere in the haystack. This is optimized for boolean-only matching. This method uses internal state and is NOT thread-safe. For concurrent usage, use IsMatchWithState.
func (*BoundedBacktracker) IsMatchAnchored ¶ added in v0.8.17
func (b *BoundedBacktracker) IsMatchAnchored(haystack []byte) bool
IsMatchAnchored returns true if the pattern matches at the start of haystack. This method uses internal state and is NOT thread-safe.
func (*BoundedBacktracker) IsMatchAnchoredWithState ¶ added in v0.10.4
func (b *BoundedBacktracker) IsMatchAnchoredWithState(haystack []byte, state *BacktrackerState) bool
IsMatchAnchoredWithState returns true if the pattern matches at the start of haystack. This method uses external state and IS thread-safe when each goroutine uses its own state.
func (*BoundedBacktracker) IsMatchWithState ¶ added in v0.10.4
func (b *BoundedBacktracker) IsMatchWithState(haystack []byte, state *BacktrackerState) bool
IsMatchWithState returns true if the pattern matches anywhere in the haystack. This method uses external state and IS thread-safe when each goroutine uses its own state.
func (*BoundedBacktracker) MaxVisitedSize ¶ added in v0.10.4
func (b *BoundedBacktracker) MaxVisitedSize() int
MaxVisitedSize returns the maximum visited array size.
func (*BoundedBacktracker) NumStates ¶ added in v0.10.4
func (b *BoundedBacktracker) NumStates() int
NumStates returns the number of NFA states (for state allocation).
func (*BoundedBacktracker) Search ¶ added in v0.8.17
func (b *BoundedBacktracker) Search(haystack []byte) (int, int, bool)
Search finds the first match in the haystack. Returns (start, end, true) if found, (-1, -1, false) otherwise. This method uses internal state and is NOT thread-safe.
func (*BoundedBacktracker) SearchAt ¶ added in v0.8.21
SearchAt finds the first match starting from position 'at'. Returns (start, end, true) if found, (-1, -1, false) otherwise. This is used by FindAll* operations for efficient iteration. In longest mode, finds the longest match at the leftmost position. This method uses internal state and is NOT thread-safe.
func (*BoundedBacktracker) SearchAtWithState ¶ added in v0.10.4
func (b *BoundedBacktracker) SearchAtWithState(haystack []byte, at int, state *BacktrackerState) (int, int, bool)
SearchAtWithState finds the first match starting from position 'at'. Returns (start, end, true) if found, (-1, -1, false) otherwise. This method uses external state and IS thread-safe when each goroutine uses its own state.
func (*BoundedBacktracker) SearchWithState ¶ added in v0.10.4
func (b *BoundedBacktracker) SearchWithState(haystack []byte, state *BacktrackerState) (int, int, bool)
SearchWithState finds the first match in the haystack. Returns (start, end, true) if found, (-1, -1, false) otherwise. This method uses external state and IS thread-safe when each goroutine uses its own state.
func (*BoundedBacktracker) SetLongest ¶ added in v0.8.24
func (b *BoundedBacktracker) SetLongest(longest bool)
SetLongest enables or disables leftmost-longest match semantics on internal state. When enabled, the backtracker finds the longest match at each position instead of returning on the first match found. Note: For thread-safe usage, set Longest directly on BacktrackerState.
type BranchDispatcher ¶ added in v0.10.4
type BranchDispatcher struct {
// contains filtered or unexported fields
}
BranchDispatcher provides O(1) branch selection for anchored alternations. For patterns like ^(\d+|UUID|hex32), it dispatches directly to the matching branch based on the first byte, avoiding the need to try all branches.
This is only applicable when:
- Pattern is start-anchored (^)
- Top-level is an alternation (a|b|c)
- Each branch has distinct first bytes (no overlap)
Performance: O(1) branch selection vs O(branches) for naive approach. For 3-branch pattern: ~3x faster on match, ~10x faster on no-match.
func NewBranchDispatcher ¶ added in v0.10.4
func NewBranchDispatcher(re *syntax.Regexp) *BranchDispatcher
NewBranchDispatcher creates a dispatcher for an anchored alternation. Returns nil if the pattern is not suitable for branch dispatch.
func (*BranchDispatcher) IsMatch ¶ added in v0.10.4
func (d *BranchDispatcher) IsMatch(haystack []byte) bool
IsMatch returns true if the haystack matches the pattern. Only checks at position 0 (for anchored patterns).
type BuildError ¶
BuildError represents an error during NFA construction via the Builder API
type BuildOption ¶
type BuildOption func(*NFA)
BuildOption is a functional option for configuring the built NFA
func WithAnchored ¶
func WithAnchored(anchored bool) BuildOption
WithAnchored sets whether the NFA requires anchored matching
func WithCaptureCount ¶ added in v0.2.0
func WithCaptureCount(count int) BuildOption
WithCaptureCount sets the number of capture groups in the NFA
func WithCaptureNames ¶ added in v0.5.0
func WithCaptureNames(names []string) BuildOption
WithCaptureNames sets the names of capture groups in the NFA. The slice should have length equal to captureCount. Index 0 should be "" (entire match), named groups have their names, unnamed groups are "".
func WithPatternCount ¶
func WithPatternCount(count int) BuildOption
WithPatternCount sets the number of patterns in the NFA
func WithUTF8 ¶
func WithUTF8(utf8 bool) BuildOption
WithUTF8 sets whether the NFA respects UTF-8 boundaries
type Builder ¶
type Builder struct {
// contains filtered or unexported fields
}
Builder constructs NFAs incrementally using a low-level API. This provides full control over NFA construction and is used by the Compiler.
func NewBuilder ¶
func NewBuilder() *Builder
NewBuilder creates a new NFA builder with default capacity
func NewBuilderWithCapacity ¶
NewBuilderWithCapacity creates a new NFA builder with specified initial capacity
func (*Builder) AddByteRange ¶
AddByteRange adds a state that transitions on a single byte or byte range [lo, hi]. For a single byte, set lo == hi.
func (*Builder) AddCapture ¶ added in v0.2.0
AddCapture adds a capture boundary state. captureIndex is the capture group number (1-based for explicit groups, 0 for entire match). isStart is true for opening boundary '(', false for closing ')'. next is the state to transition to after recording the capture position.
func (*Builder) AddEpsilon ¶
AddEpsilon adds a state with a single epsilon transition (no input consumed)
func (*Builder) AddLook ¶ added in v0.8.4
AddLook adds a zero-width assertion state (look-around). look is the assertion type (start/end of text/line). next is the state to transition to if the assertion succeeds.
func (*Builder) AddQuantifierSplit ¶ added in v0.8.9
AddQuantifierSplit adds a state with epsilon transitions for quantifiers (*, +, ?, {n,m}). Unlike alternation splits, quantifier splits don't affect thread priority. Left branch is the "continue/repeat" path, right branch is the "exit" path. This distinction is crucial for correct leftmost-first matching semantics.
func (*Builder) AddRuneAny ¶ added in v0.8.9
AddRuneAny adds a state that matches any Unicode codepoint (including newlines). This is used for (?s). (dot with DOTALL flag). The state consumes 1-4 bytes (UTF-8 encoded rune) and transitions to next.
func (*Builder) AddRuneAnyNotNL ¶ added in v0.8.9
AddRuneAnyNotNL adds a state that matches any Unicode codepoint except newline. This is used for the default . (dot) behavior. The state consumes 1-4 bytes (UTF-8 encoded rune) and transitions to next.
func (*Builder) AddSparse ¶
func (b *Builder) AddSparse(transitions []Transition) StateID
AddSparse adds a state with multiple byte range transitions (character class). The transitions slice is copied to avoid aliasing issues.
func (*Builder) AddSplit ¶
AddSplit adds a state with epsilon transitions to two states (alternation). This is used for alternation (a|b). For quantifiers, use AddQuantifierSplit.
func (*Builder) Build ¶
func (b *Builder) Build(opts ...BuildOption) (*NFA, error)
Build finalizes and returns the constructed NFA. Options can be provided to set anchored/utf8 modes and pattern count.
func (*Builder) Patch ¶
Patch updates a state's target. This is used during compilation to handle forward references (e.g., loops, alternations). This only works for states with a single 'next' target (ByteRange, Epsilon).
func (*Builder) PatchSplit ¶
PatchSplit updates the left or right target of a Split state
type ByteClassSet ¶ added in v0.4.0
type ByteClassSet struct {
// contains filtered or unexported fields
}
ByteClassSet tracks byte boundaries during NFA construction.
During NFA compilation, we track which bytes are "boundary" bytes where equivalence classes change. For example, if pattern is [a-z], then bytes 'a'-1 and 'z' are boundaries.
Algorithm:
- For each ByteRange [lo, hi] in NFA transitions: - If lo > 0: mark lo-1 as boundary - Mark hi as boundary
- Convert boundaries to classes by incrementing class at each boundary
func NewByteClassSet ¶ added in v0.4.0
func NewByteClassSet() *ByteClassSet
NewByteClassSet creates an empty ByteClassSet with no boundaries.
func (*ByteClassSet) ByteClasses ¶ added in v0.4.0
func (bcs *ByteClassSet) ByteClasses() ByteClasses
ByteClasses converts the boundary set into a ByteClasses lookup table.
Algorithm: Walk through all 256 bytes, incrementing the class number each time we encounter a boundary byte.
func (*ByteClassSet) Merge ¶ added in v0.4.0
func (bcs *ByteClassSet) Merge(other *ByteClassSet)
Merge combines another ByteClassSet into this one. Used when building composite patterns.
func (*ByteClassSet) SetByte ¶ added in v0.4.0
func (bcs *ByteClassSet) SetByte(b byte)
SetByte marks a single byte as having a distinct transition. Equivalent to SetRange(b, b).
func (*ByteClassSet) SetRange ¶ added in v0.4.0
func (bcs *ByteClassSet) SetRange(start, end byte)
SetRange marks a byte range [start, end] as having distinct transitions. This sets boundary bits at start-1 and end.
type ByteClasses ¶ added in v0.4.0
type ByteClasses struct {
// contains filtered or unexported fields
}
ByteClasses maps each byte value to its equivalence class.
ByteClasses is an alphabet reduction technique that groups bytes into equivalence classes based on how the regex treats them. Instead of maintaining 256 transitions per DFA state, transitions are reduced to N classes (typically 4-64), dramatically reducing memory footprint.
Key principle: Two bytes belong to the same equivalence class if they will never cause different transitions in any DFA state for the given regex.
Example for pattern [a-z]+:
- Class 0: bytes 0x00-0x60 (before 'a')
- Class 1: bytes 0x61-0x7a ('a' to 'z')
- Class 2: bytes 0x7b-0xff (after 'z')
Memory impact:
- Without ByteClasses: 256 transitions per state
- With ByteClasses: ~8-16 transitions per state (15-30x reduction)
func NewByteClasses ¶ added in v0.4.0
func NewByteClasses() ByteClasses
NewByteClasses creates a new ByteClasses where all bytes are in class 0.
func SingletonByteClasses ¶ added in v0.4.0
func SingletonByteClasses() ByteClasses
SingletonByteClasses creates ByteClasses where each byte is its own class. This is equivalent to no alphabet reduction (256 classes).
func (*ByteClasses) AlphabetLen ¶ added in v0.4.0
func (bc *ByteClasses) AlphabetLen() int
AlphabetLen returns the total number of equivalence classes. This is the number of distinct class values + 1 for potential EOI sentinel.
func (*ByteClasses) Elements ¶ added in v0.4.0
func (bc *ByteClasses) Elements(class byte) []byte
Elements returns all bytes that belong to the given equivalence class.
func (*ByteClasses) Get ¶ added in v0.4.0
func (bc *ByteClasses) Get(b byte) byte
Get returns the equivalence class for the given byte. This is an O(1) lookup.
func (*ByteClasses) IsEmpty ¶ added in v0.4.0
func (bc *ByteClasses) IsEmpty() bool
IsEmpty returns true if all bytes are in the same equivalence class (class 0). This typically means the regex matches any byte or no bytes at all.
func (*ByteClasses) IsSingleton ¶ added in v0.4.0
func (bc *ByteClasses) IsSingleton() bool
IsSingleton returns true if each byte is its own equivalence class. This means no alphabet reduction is possible.
func (*ByteClasses) Representatives ¶ added in v0.4.0
func (bc *ByteClasses) Representatives() []byte
Representatives returns a slice of representative bytes, one for each class. Each representative can be used to compute transitions for all bytes in that class.
type CharClassSearcher ¶ added in v0.8.21
type CharClassSearcher struct {
// contains filtered or unexported fields
}
CharClassSearcher provides optimized search for simple character class patterns. For patterns like [\w]+, [a-z]+, \d+ where:
- Pattern is just a repeated character class (no alternation, no anchors)
- Character class can be represented as byte ranges
This searcher uses a 256-byte lookup table for O(1) membership test, avoiding the overhead of recursive backtracking.
Performance: 3-5x faster than BoundedBacktracker for char_class patterns.
Note: SIMD optimization was evaluated but found to be slower for char_class patterns because matches are frequent (30-50% of positions) and short. The scalar lookup table approach is optimal for this use case. For large-scale char_class search, consider using Lazy DFA instead.
func NewCharClassSearcher ¶ added in v0.8.21
func NewCharClassSearcher(ranges [][2]byte, minMatch int) *CharClassSearcher
NewCharClassSearcher creates a searcher from byte ranges. ranges is a list of [lo, hi] pairs where bytes in [lo, hi] match. minMatch is 1 for + quantifier, 0 for * quantifier.
func NewCharClassSearcherFromNFA ¶ added in v0.8.21
func NewCharClassSearcherFromNFA(n *NFA) *CharClassSearcher
NewCharClassSearcherFromNFA extracts byte ranges from an NFA and creates a searcher. Returns nil if the NFA is not a simple char_class+ pattern.
func (*CharClassSearcher) CanHandle ¶ added in v0.8.21
func (s *CharClassSearcher) CanHandle(_ int) bool
CanHandle returns true - CharClassSearcher can handle any input size.
func (*CharClassSearcher) Count ¶ added in v0.9.4
func (s *CharClassSearcher) Count(haystack []byte) int
Count returns the number of non-overlapping matches using single-pass state machine. This is faster than len(FindAllIndices()) because it doesn't allocate a results slice.
func (*CharClassSearcher) FindAllIndices ¶ added in v0.9.4
func (s *CharClassSearcher) FindAllIndices(haystack []byte, results [][2]int) [][2]int
FindAllIndices finds all non-overlapping matches using a single-pass state machine. This is significantly faster than repeated SearchAt calls because:
- No per-match function call overhead
- Better CPU branch prediction (consistent state transitions)
- Single pass through input (cache-friendly)
Returns slice of [start, end] pairs. If results slice is provided and has sufficient capacity, it will be reused to avoid allocation.
This implements the Rust regex approach: streaming state machine with SEARCHING/MATCHING states instead of separate find-start/find-end loops.
func (*CharClassSearcher) IsMatch ¶ added in v0.8.21
func (s *CharClassSearcher) IsMatch(haystack []byte) bool
IsMatch returns true if pattern matches anywhere in haystack.
type CompileError ¶
CompileError wraps compilation errors with additional context
func (*CompileError) Error ¶
func (e *CompileError) Error() string
Error implements the error interface
func (*CompileError) Unwrap ¶
func (e *CompileError) Unwrap() error
Unwrap returns the underlying error
type Compiler ¶
type Compiler struct {
// contains filtered or unexported fields
}
Compiler compiles regexp/syntax.Regexp patterns into Thompson NFAs
func NewCompiler ¶
func NewCompiler(config CompilerConfig) *Compiler
NewCompiler creates a new NFA compiler with the given configuration
func NewDefaultCompiler ¶
func NewDefaultCompiler() *Compiler
NewDefaultCompiler creates a new NFA compiler with default configuration
type CompilerConfig ¶
type CompilerConfig struct {
// UTF8 determines whether the NFA respects UTF-8 boundaries.
// When true, empty matches that split UTF-8 sequences are avoided.
UTF8 bool
// Anchored forces the pattern to match only at the start of input
Anchored bool
// DotNewline determines whether '.' matches '\n'
DotNewline bool
// ASCIIOnly when true, compiles '.' to match only ASCII bytes (0x00-0x7F).
// This dramatically reduces NFA state count (1 state vs ~28 states) and
// improves performance for patterns with '.' when input is known to be ASCII.
//
// When false (default), '.' compiles to match any valid UTF-8 codepoint,
// requiring ~28 NFA states to handle all valid UTF-8 byte sequences.
//
// This is used for ASCII runtime detection optimization (V11-002):
// compile both ASCII and UTF-8 NFAs, select at runtime based on input.
ASCIIOnly bool
// MaxRecursionDepth limits recursion during compilation to prevent stack overflow
// Default: 100
MaxRecursionDepth int
}
CompilerConfig configures NFA compilation behavior
func DefaultCompilerConfig ¶
func DefaultCompilerConfig() CompilerConfig
DefaultCompilerConfig returns a compiler configuration with sensible defaults
type CompositeSearcher ¶ added in v0.10.4
type CompositeSearcher struct {
// contains filtered or unexported fields
}
CompositeSearcher is a specialized searcher for concatenated character class patterns. For patterns like `[a-zA-Z]+[0-9]+`, it uses sequential lookup tables to achieve 5-6x speedup over BoundedBacktracker.
Algorithm:
- Each char class part has a [256]bool membership table for O(1) lookup
- Greedy matching: consume as many characters as possible for each part
- Backtrack if a part doesn't meet its minimum match requirement
Example patterns:
- [a-zA-Z]+[0-9]+ → letters followed by digits
- \d+\s+\w+ → digits, whitespace, word chars
- [a-z]+[A-Z]+ → lowercase then uppercase
Thread safety: NOT thread-safe. For concurrent usage, each goroutine needs its own instance.
Reference: https://github.com/coregx/coregex/issues/72
func NewCompositeSearcher ¶ added in v0.10.4
func NewCompositeSearcher(re *syntax.Regexp) *CompositeSearcher
NewCompositeSearcher creates a CompositeSearcher from a syntax.Regexp. Returns nil if the pattern is not a valid composite char class pattern.
Valid patterns are concatenations of character classes with +, *, ?, or {n,m} quantifiers.
func (*CompositeSearcher) IsMatch ¶ added in v0.10.4
func (c *CompositeSearcher) IsMatch(haystack []byte) bool
IsMatch returns true if the haystack contains a match.
type CompositeSequenceDFA ¶ added in v0.10.6
type CompositeSequenceDFA struct {
// contains filtered or unexported fields
}
CompositeSequenceDFA is a specialized DFA for composite character class patterns. It uses NFA subset construction to handle overlapping character classes correctly.
For pattern `\w+[0-9]+` with overlapping chars (digits are in both \w and [0-9]):
- Uses byte classes to reduce transitions
- Each DFA state represents a set of NFA positions (which parts could be active)
- Handles overlap by tracking multiple possible parse positions simultaneously
Thread safety: NOT thread-safe. For concurrent usage, each goroutine needs its own instance.
func NewCompositeSequenceDFA ¶ added in v0.10.6
func NewCompositeSequenceDFA(re *syntax.Regexp) *CompositeSequenceDFA
NewCompositeSequenceDFA creates a specialized DFA for composite patterns. Returns nil if the pattern is not suitable for this DFA.
func (*CompositeSequenceDFA) IsMatch ¶ added in v0.10.6
func (d *CompositeSequenceDFA) IsMatch(haystack []byte) bool
IsMatch returns true if the haystack contains a match.
type FirstByteSet ¶ added in v0.10.4
type FirstByteSet struct {
// contains filtered or unexported fields
}
FirstByteSet represents the set of bytes that can start a match. Used for O(1) early rejection of non-matching inputs.
func ExtractFirstBytes ¶ added in v0.10.4
func ExtractFirstBytes(re *syntax.Regexp) *FirstByteSet
ExtractFirstBytes extracts the set of possible first bytes from a pattern. Returns nil if the pattern is too complex or can match empty string.
This is used for O(1) early rejection of non-matching inputs in anchored patterns. For example, for ^(\d+|UUID|hex32):
- Valid first bytes: 0-9, 'U', 'h'
- Any other first byte → immediate rejection
func (*FirstByteSet) Contains ¶ added in v0.10.4
func (f *FirstByteSet) Contains(b byte) bool
Contains returns true if b can be the first byte of a match.
func (*FirstByteSet) Count ¶ added in v0.10.4
func (f *FirstByteSet) Count() int
Count returns the number of possible first bytes.
func (*FirstByteSet) IsComplete ¶ added in v0.10.4
func (f *FirstByteSet) IsComplete() bool
IsComplete returns true if this set is exhaustive.
func (*FirstByteSet) IsUseful ¶ added in v0.10.4
func (f *FirstByteSet) IsUseful() bool
IsUseful returns true if this prefilter can reject inputs. Returns false if:
- All 256 bytes are valid (e.g., for .* patterns)
- No bytes are valid (e.g., for ^ patterns that match empty at position 0)
- Set is incomplete (pattern may match starting with unknown bytes)
type Look ¶ added in v0.8.4
type Look uint8
Look represents a zero-width assertion type (look-around)
const ( // LookStartText matches the start of input (\A) LookStartText Look = iota // LookEndText matches the end of input (\z) LookEndText // LookStartLine matches the start of a line (^) // Matches at position 0 or after a newline LookStartLine // LookEndLine matches the end of a line ($) // Matches at end of input or before a newline LookEndLine // LookWordBoundary matches a word boundary (\b) // Matches where is_word_char(prev) != is_word_char(curr) // Word chars are ASCII: [a-zA-Z0-9_] LookWordBoundary // LookNoWordBoundary matches a non-word boundary (\B) // Matches where is_word_char(prev) == is_word_char(curr) LookNoWordBoundary )
type MatchWithCaptures ¶ added in v0.2.0
type MatchWithCaptures struct {
Start int
End int
Captures [][]int // Captures[i] = [start, end] for group i, or nil if not captured
}
MatchWithCaptures represents a match including capture group positions. Captures is a slice where Captures[i] is [start, end] for group i. Group 0 is the entire match.
type NFA ¶
type NFA struct {
// contains filtered or unexported fields
}
NFA represents a compiled Thompson NFA. It is the result of compiling a regexp/syntax.Regexp pattern.
func Reverse ¶ added in v0.4.0
Reverse builds a reverse NFA from the given forward NFA.
In a reverse NFA:
- Start and match states are swapped
- All transitions are reversed (A->B becomes B->A)
- The automaton matches patterns backward from the end
This is used for reverse searching, which enables efficient matching of patterns with $ anchor by searching backward from the end of the haystack.
Algorithm (Two-Pass):
Pass 1: Collect all reverse edges and allocate state IDs Pass 2: Build actual states with correct transition targets
Note: Reverse NFA does NOT support capture groups. Captures require forward search to maintain correct positions.
Example:
Forward NFA for "abc": start -> a -> b -> c -> match Reverse NFA: start(from match) -> c -> b -> a -> match(from start)
func ReverseAnchored ¶ added in v0.4.0
ReverseAnchored builds a reverse NFA that is anchored at start. This is used for patterns ending with $ anchor, where the reverse NFA should only match starting at position 0 of the reversed haystack.
This is equivalent to Reverse() followed by marking the result as anchored.
func (*NFA) ByteClasses ¶ added in v0.4.0
func (n *NFA) ByteClasses() *ByteClasses
ByteClasses returns the byte equivalence classes for this NFA. Used by DFA to reduce transition table size from 256 to ~8-16 entries.
func (*NFA) CaptureCount ¶ added in v0.2.0
CaptureCount returns the number of capture groups in the NFA. Group 0 is the entire match, groups 1+ are explicit captures. For a pattern like "(a)(b)", this returns 3 (entire match + 2 groups).
func (*NFA) IsAlwaysAnchored ¶ added in v0.1.2
IsAlwaysAnchored returns true if anchored and unanchored starts are the same. This indicates the pattern is inherently anchored (has ^ prefix).
func (*NFA) IsAnchored ¶
IsAnchored returns true if the NFA requires anchored matching
func (*NFA) PatternCount ¶
PatternCount returns the number of patterns in the NFA
func (*NFA) StartAnchored ¶ added in v0.1.2
StartAnchored returns the start state for anchored searches
func (*NFA) StartUnanchored ¶ added in v0.1.2
StartUnanchored returns the start state for unanchored searches
func (*NFA) SubexpNames ¶ added in v0.5.0
SubexpNames returns the names of capture groups in the pattern. Index 0 is always "" (representing the entire match). Named groups return their names, unnamed groups return "".
Example:
pattern: `(?P<year>\d+)-(\d+)-(?P<day>\d+)` returns: ["", "year", "", "day"]
This matches stdlib regexp.Regexp.SubexpNames() behavior.
type PikeVM ¶
type PikeVM struct {
// contains filtered or unexported fields
}
PikeVM implements the Pike VM algorithm for NFA execution. It simulates the NFA by maintaining a set of active states and exploring all possible paths through the automaton.
The Pike VM is slower than DFA-based approaches but handles all regex features including backreferences (future) and capturing groups.
Thread safety: PikeVM configuration (nfa) is immutable after creation. For thread-safe concurrent usage, use *WithState methods with external PikeVMState. The legacy methods without state use internal state and are NOT thread-safe.
func (*PikeVM) InitState ¶ added in v0.10.4
func (p *PikeVM) InitState(state *PikeVMState)
InitState initializes this state for use with the given PikeVM. Must be called before using the state with *WithState methods.
func (*PikeVM) IsMatch ¶ added in v0.8.15
IsMatch returns true if the pattern matches anywhere in the haystack. This is optimized for boolean-only matching - it returns as soon as any match is found without computing exact match positions.
This is significantly faster than Search() when you only need to know if a match exists, not where it is.
func (*PikeVM) NumStates ¶ added in v0.10.4
NumStates returns the number of NFA states (for state allocation).
func (*PikeVM) Search ¶
Search finds the first match in the haystack. Returns (start, end, true) if a match is found, or (-1, -1, false) if not.
The search is unanchored by default (matches anywhere in haystack) unless the NFA was compiled with anchored mode.
This method uses internal state and is NOT thread-safe. For concurrent usage, use SearchWithState.
func (*PikeVM) SearchAll ¶
SearchAll finds all non-overlapping matches in the haystack. Returns a slice of matches in order of occurrence.
func (*PikeVM) SearchAt ¶ added in v0.8.6
SearchAt finds the first match in the haystack starting from position 'at'. Returns (start, end, true) if a match is found, or (-1, -1, false) if not.
This method is used by FindAll* operations to correctly handle anchors like ^. Unlike Search, it takes the FULL haystack and a starting position, so assertions like ^ correctly check against the original input start, not a sliced position.
func (*PikeVM) SearchBetween ¶ added in v0.9.2
SearchBetween finds the first match in the range [startAt, maxEnd] of haystack. This is an optimization for cases where we know the match ends at or before maxEnd (e.g., after DFA found the end position). It avoids scanning the full haystack.
Parameters:
- haystack: the input byte slice
- startAt: minimum position to start searching
- maxEnd: maximum position where match can end (exclusive for search, inclusive for match)
Returns (start, end, found) where start >= startAt and end <= maxEnd.
Performance: O(maxEnd - startAt) instead of O(len(haystack) - startAt).
func (*PikeVM) SearchWithCaptures ¶ added in v0.2.0
func (p *PikeVM) SearchWithCaptures(haystack []byte) *MatchWithCaptures
SearchWithCaptures finds the first match with capture group positions. Returns nil if no match is found.
func (*PikeVM) SearchWithCapturesAt ¶ added in v0.8.6
func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCaptures
SearchWithCapturesAt finds the first match with capture group positions, starting from position 'at' in the haystack. Returns nil if no match is found.
This method is used by FindAll* operations to correctly handle anchors like ^. Unlike SearchWithCaptures, it takes the FULL haystack and a starting position.
func (*PikeVM) SetLongest ¶ added in v0.8.12
SetLongest enables or disables leftmost-longest (POSIX) matching semantics. By default, uses leftmost-first (Perl) semantics where first alternative wins. When longest=true, the longest match at the same start position wins. Note: This modifies internal state. For thread-safe usage, set Longest directly on PikeVMState.
type PikeVMState ¶ added in v0.10.4
type PikeVMState struct {
// Thread queues for current and next generation
// Pre-allocated to avoid allocations during search
Queue []thread
NextQueue []thread
// Sparse set for tracking visited states in current generation
// This prevents processing the same state multiple times
Visited *sparse.SparseSet
// Longest enables leftmost-longest (POSIX) matching semantics.
// By default (false), uses leftmost-first (Perl) semantics where
// the first alternative wins. When true, the longest match wins.
Longest bool
}
PikeVMState holds mutable per-search state for PikeVM. This struct should be pooled (via sync.Pool) for concurrent usage. Each goroutine must use its own PikeVMState instance.
func NewPikeVMState ¶ added in v0.10.4
func NewPikeVMState() *PikeVMState
NewPikeVMState creates a new mutable state for use with PikeVM. The state must be initialized by calling PikeVM.InitState before use. This should be pooled via sync.Pool for concurrent usage.
type State ¶
type State struct {
// contains filtered or unexported fields
}
State represents a single NFA state with its transitions. The state's kind determines which fields are valid.
func (*State) ByteRange ¶
ByteRange returns the byte range for ByteRange states. Returns (0, 0, InvalidState) for non-ByteRange states.
func (*State) Capture ¶ added in v0.2.0
Capture returns capture group info for Capture states. Returns (group index, isStart, next state). isStart is true for opening boundary '(' and false for closing ')'.
func (*State) Epsilon ¶
Epsilon returns the target state for Epsilon states. Returns InvalidState for non-Epsilon states.
func (*State) IsQuantifierSplit ¶ added in v0.8.9
IsQuantifierSplit returns true if this Split state is for a quantifier (*, +, ?, {n,m}). Quantifier splits don't affect thread priority (greedy behavior is handled by DFS order). Alternation splits increment priority for the right branch (leftmost-first semantics).
func (*State) Look ¶ added in v0.8.4
Look returns the look-around assertion type and target state for Look states. Returns (0, InvalidState) for non-Look states.
func (*State) RuneAny ¶ added in v0.8.9
RuneAny returns the next state for RuneAny states. Returns InvalidState for non-RuneAny states.
func (*State) RuneAnyNotNL ¶ added in v0.8.9
RuneAnyNotNL returns the next state for RuneAnyNotNL states. Returns InvalidState for non-RuneAnyNotNL states.
func (*State) Split ¶
Split returns the two target states for Split states. Returns (InvalidState, InvalidState) for non-Split states.
func (*State) Transitions ¶
func (s *State) Transitions() []Transition
Transitions returns the list of transitions for Sparse states. Returns nil for non-Sparse states.
type StateID ¶
type StateID uint32
StateID uniquely identifies an NFA state. This is a 32-bit unsigned integer for compact representation.
type StateIter ¶
type StateIter struct {
// contains filtered or unexported fields
}
StateIter is an iterator over NFA states
type StateKind ¶
type StateKind uint8
StateKind identifies the type of NFA state and determines which transitions are valid.
const ( // StateMatch represents a match state (accepting state) StateMatch StateKind = iota // StateByteRange represents a single byte or byte range transition [lo, hi] StateByteRange // StateSparse represents multiple byte transitions (character class) // e.g., [a-zA-Z0-9] would use this with a list of byte ranges StateSparse // StateSplit represents an epsilon transition to 2 states (alternation) // Used for alternation (a|b) and optional patterns (a?) StateSplit // StateEpsilon represents an epsilon transition to 1 state // Used for sequencing without consuming input StateEpsilon // StateCapture represents a capture group boundary (future feature) // Not implemented in MVP but reserved for future use StateCapture // StateFail represents a dead state (no valid transitions) StateFail // StateLook represents a zero-width assertion (look-around) // Examples: ^, $, \A, \z (word boundaries \b, \B in future) StateLook // StateRuneAny matches any Unicode codepoint (including newlines) // This is the (?s). (dot with DOTALL flag) StateRuneAny // StateRuneAnyNotNL matches any Unicode codepoint except newline (\n) // This is the default . (dot) behavior StateRuneAnyNotNL )
type Transition ¶
type Transition struct {
Lo byte // inclusive lower bound
Hi byte // inclusive upper bound
Next StateID // target state
}
Transition represents a byte range and target state for sparse transitions. Used in character classes like [a-zA-Z0-9].