π File detail
utils/bash/bashParser.ts
π― Use case
This file lives under βutils/β, which covers cross-cutting helpers (shell, tempfiles, settings, messages, process input, β¦). On the API surface it exposes TsNode, ensureParserInitialized, getParserModule, and SHELL_KEYWORDS β mainly functions, hooks, or classes. What the file header says: Pure-TypeScript bash parser producing tree-sitter-bash-compatible ASTs. Downstream code in parser.ts, ast.ts, prefix.ts, ParsedCommand.ts walks this by field name. startIndex/endIndex are UTF-8 BYTE offsets (not JS string indices). Grammar reference: tree-sitter-bash. Validated a.
Generated from folder role, exports, dependency roots, and inline comments β not hand-reviewed for every path.
π§ Inline summary
Pure-TypeScript bash parser producing tree-sitter-bash-compatible ASTs. Downstream code in parser.ts, ast.ts, prefix.ts, ParsedCommand.ts walks this by field name. startIndex/endIndex are UTF-8 BYTE offsets (not JS string indices). Grammar reference: tree-sitter-bash. Validated against a 3449-input golden corpus generated from the WASM parser.
π€ Exports (heuristic)
TsNodeensureParserInitializedgetParserModuleSHELL_KEYWORDS
π₯οΈ Source preview
/**
* Pure-TypeScript bash parser producing tree-sitter-bash-compatible ASTs.
*
* Downstream code in parser.ts, ast.ts, prefix.ts, ParsedCommand.ts walks this
* by field name. startIndex/endIndex are UTF-8 BYTE offsets (not JS string
* indices).
*
* Grammar reference: tree-sitter-bash. Validated against a 3449-input golden
* corpus generated from the WASM parser.
*/
export type TsNode = {
type: string
text: string
startIndex: number
endIndex: number
children: TsNode[]
}
type ParserModule = {
parse: (source: string, timeoutMs?: number) => TsNode | null
}
/**
* 50ms wall-clock cap β bails out on pathological/adversarial input.
* Pass `Infinity` via `parse(src, Infinity)` to disable (e.g. correctness
* tests, where CI jitter would otherwise cause spurious null returns).
*/
const PARSE_TIMEOUT_MS = 50
/** Node budget cap β bails out before OOM on deeply nested input. */
const MAX_NODES = 50_000
const MODULE: ParserModule = { parse: parseSource }
const READY = Promise.resolve()
/** No-op: pure-TS parser needs no async init. Kept for API compatibility. */
export function ensureParserInitialized(): Promise<void> {
return READY
}
/** Always succeeds β pure-TS needs no init. */
export function getParserModule(): ParserModule | null {
return MODULE
}
// βββββββββββββββββββββββββββββ Tokenizer βββββββββββββββββββββββββββββ
type TokenType =
| 'WORD'
| 'NUMBER'
| 'OP'
| 'NEWLINE'
| 'COMMENT'
| 'DQUOTE'
| 'SQUOTE'
| 'ANSI_C'
| 'DOLLAR'
| 'DOLLAR_PAREN'
| 'DOLLAR_BRACE'
| 'DOLLAR_DPAREN'
| 'BACKTICK'
| 'LT_PAREN'
| 'GT_PAREN'
| 'EOF'
type Token = {
type: TokenType
value: string
/** UTF-8 byte offset of first char */
start: number
/** UTF-8 byte offset one past last char */
end: number
}
const SPECIAL_VARS = new Set(['?', '$', '@', '*', '#', '-', '!', '_'])
const DECL_KEYWORDS = new Set([
'export',
'declare',
'typeset',
'readonly',
'local',
])
export const SHELL_KEYWORDS = new Set([
'if',
'then',
'elif',
'else',
'fi',
'while',
'until',
'for',
'in',
'do',
'done',
'case',
'esac',
'function',
'select',
])
/**
* Lexer state. Tracks both JS-string index (for charAt) and UTF-8 byte offset
* (for TsNode positions). ASCII fast path: byte == char index. Non-ASCII
* advances byte count per-codepoint.
*/
type Lexer = {
src: string
len: number
/** JS string index */
i: number
/** UTF-8 byte offset */
b: number
/** Pending heredoc delimiters awaiting body scan at next newline */
heredocs: HeredocPending[]
/** Precomputed byte offset for each char index (lazy for non-ASCII) */
byteTable: Uint32Array | null
}
type HeredocPending = {
delim: string
stripTabs: boolean
quoted: boolean
/** Filled after body scan */
bodyStart: number
bodyEnd: number
endStart: number
endEnd: number
}
function makeLexer(src: string): Lexer {
return {
src,
len: src.length,
i: 0,
b: 0,
heredocs: [],
byteTable: null,
}
}
/** Advance one JS char, updating byte offset for UTF-8. */
function advance(L: Lexer): void {
const c = L.src.charCodeAt(L.i)
L.i++
if (c < 0x80) {
L.b++
} else if (c < 0x800) {
L.b += 2
} else if (c >= 0xd800 && c <= 0xdbff) {
// High surrogate β next char completes the pair, total 4 UTF-8 bytes
L.b += 4
L.i++
} else {
L.b += 3
}
}
function peek(L: Lexer, off = 0): string {
return L.i + off < L.len ? L.src[L.i + off]! : ''
}
function byteAt(L: Lexer, charIdx: number): number {
// Fast path: ASCII-only prefix means char idx == byte idx
if (L.byteTable) return L.byteTable[charIdx]!
// Build table on first non-trivial lookup
const t = new Uint32Array(L.len + 1)
let b = 0
let i = 0
while (i < L.len) {
t[i] = b
const c = L.src.charCodeAt(i)
if (c < 0x80) {
b++
i++
} else if (c < 0x800) {
b += 2
i++
} else if (c >= 0xd800 && c <= 0xdbff) {
t[i + 1] = b + 2
b += 4
i += 2
} else {
b += 3
i++
}
}
t[L.len] = b
L.byteTable = t
return t[charIdx]!
}
function isWordChar(c: string): boolean {
// Bash word chars: alphanumeric + various punctuation that doesn't start operators
return (
(c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
c === '_' ||
c === '/' ||
c === '.' ||
c === '-' ||
c === '+' ||
c === ':' ||
c === '@' ||
c === '%' ||
c === ',' ||
c === '~' ||
c === '^' ||
c === '?' ||
c === '*' ||
c === '!' ||
c === '=' ||
c === '[' ||
c === ']'
)
}
function isWordStart(c: string): boolean {
return isWordChar(c) || c === '\\'
}
function isIdentStart(c: string): boolean {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c === '_'
}
function isIdentChar(c: string): boolean {
return isIdentStart(c) || (c >= '0' && c <= '9')
}
function isDigit(c: string): boolean {
return c >= '0' && c <= '9'
}
function isHexDigit(c: string): boolean {
return isDigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}
function isBaseDigit(c: string): boolean {
// Bash BASE#DIGITS: digits, letters, @ and _ (up to base 64)
return isIdentChar(c) || c === '@'
}
/**
* Unquoted heredoc delimiter chars. Bash accepts most non-metacharacters β
* not just identifiers. Stop at whitespace, redirects, pipe/list operators,
* and structural tokens. Allows !, -, ., +, etc. (e.g. <<!HEREDOC!).
*/
function isHeredocDelimChar(c: string): boolean {
return (
c !== '' &&
c !== ' ' &&
c !== '\t' &&
c !== '\n' &&
c !== '<' &&
c !== '>' &&
c !== '|' &&
c !== '&' &&
c !== ';' &&
c !== '(' &&
c !== ')' &&
c !== "'" &&
c !== '"' &&
c !== '`' &&
c !== '\\'
)
}
function skipBlanks(L: Lexer): void {
while (L.i < L.len) {
const c = L.src[L.i]!
if (c === ' ' || c === '\t' || c === '\r') {
// \r is whitespace per tree-sitter-bash extras /\s/ β handles CRLF inputs
advance(L)
} else if (c === '\\') {
const nx = L.src[L.i + 1]
if (nx === '\n' || (nx === '\r' && L.src[L.i + 2] === '\n')) {
// Line continuation β tree-sitter extras: /\\\r?\n/
advance(L)
advance(L)
if (nx === '\r') advance(L)
} else if (nx === ' ' || nx === '\t') {
// \<space> or \<tab> β tree-sitter's _whitespace is /\\?[ \t\v]+/
advance(L)
advance(L)
} else {
break
}
} else {
break
}
}
}
/**
* Scan next token. Context-sensitive: `cmd` mode treats [ as operator (test
* command start), `arg` mode treats [ as word char (glob/subscript).
*/
function nextToken(L: Lexer, ctx: 'cmd' | 'arg' = 'arg'): Token {
skipBlanks(L)
const start = L.b
if (L.i >= L.len) return { type: 'EOF', value: '', start, end: start }
const c = L.src[L.i]!
const c1 = peek(L, 1)
const c2 = peek(L, 2)
if (c === '\n') {
advance(L)
return { type: 'NEWLINE', value: '\n', start, end: L.b }
}
if (c === '#') {
const si = L.i
while (L.i < L.len && L.src[L.i] !== '\n') advance(L)
return {
type: 'COMMENT',
value: L.src.slice(si, L.i),
start,
end: L.b,
}
}
// Multi-char operators (longest match first)
if (c === '&' && c1 === '&') {
advance(L)
advance(L)
return { type: 'OP', value: '&&', start, end: L.b }
}
if (c === '|' && c1 === '|') {
advance(L)
advance(L)
return { type: 'OP', value: '||', start, end: L.b }
}
if (c === '|' && c1 === '&') {
advance(L)
advance(L)
return { type: 'OP', value: '|&', start, end: L.b }
}
if (c === ';' && c1 === ';' && c2 === '&') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: ';;&', start, end: L.b }
}
if (c === ';' && c1 === ';') {
advance(L)
advance(L)
return { type: 'OP', value: ';;', start, end: L.b }
}
if (c === ';' && c1 === '&') {
advance(L)
advance(L)
return { type: 'OP', value: ';&', start, end: L.b }
}
if (c === '>' && c1 === '>') {
advance(L)
advance(L)
return { type: 'OP', value: '>>', start, end: L.b }
}
if (c === '>' && c1 === '&' && c2 === '-') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: '>&-', start, end: L.b }
}
if (c === '>' && c1 === '&') {
advance(L)
advance(L)
return { type: 'OP', value: '>&', start, end: L.b }
}
if (c === '>' && c1 === '|') {
advance(L)
advance(L)
return { type: 'OP', value: '>|', start, end: L.b }
}
if (c === '&' && c1 === '>' && c2 === '>') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: '&>>', start, end: L.b }
}
if (c === '&' && c1 === '>') {
advance(L)
advance(L)
return { type: 'OP', value: '&>', start, end: L.b }
}
if (c === '<' && c1 === '<' && c2 === '<') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: '<<<', start, end: L.b }
}
if (c === '<' && c1 === '<' && c2 === '-') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: '<<-', start, end: L.b }
}
if (c === '<' && c1 === '<') {
advance(L)
advance(L)
return { type: 'OP', value: '<<', start, end: L.b }
}
if (c === '<' && c1 === '&' && c2 === '-') {
advance(L)
advance(L)
advance(L)
return { type: 'OP', value: '<&-', start, end: L.b }
}
if (c === '<' && c1 === '&') {
advance(L)
advance(L)
return { type: 'OP', value: '<&', start, end: L.b }
}
if (c === '<' && c1 === '(') {
advance(L)
advance(L)
return { type: 'LT_PAREN', value: '<(', start, end: L.b }
}
if (c === '>' && c1 === '(') {
advance(L)
advance(L)
return { type: 'GT_PAREN', value: '>(', start, end: L.b }
}
if (c === '(' && c1 === '(') {
advance(L)
advance(L)
return { type: 'OP', value: '((', start, end: L.b }
}
if (c === ')' && c1 === ')') {
advance(L)
advance(L)
return { type: 'OP', value: '))', start, end: L.b }
}
if (c === '|' || c === '&' || c === ';' || c === '>' || c === '<') {
advance(L)
return { type: 'OP', value: c, start, end: L.b }
}
if (c === '(' || c === ')') {
advance(L)
return { type: 'OP', value: c, start, end: L.b }
}
// In cmd position, [ [[ { start test/group; in arg position they're word chars
if (ctx === 'cmd') {
if (c === '[' && c1 === '[') {
advance(L)
advance(L)
return { type: 'OP', value: '[[', start, end: L.b }
}
if (c === '[') {
advance(L)
return { type: 'OP', value: '[', start, end: L.b }
}
if (c === '{' && (c1 === ' ' || c1 === '\t' || c1 === '\n')) {
advance(L)
return { type: 'OP', value: '{', start, end: L.b }
}
if (c === '}') {
advance(L)
return { type: 'OP', value: '}', start, end: L.b }
}
if (c === '!' && (c1 === ' ' || c1 === '\t')) {
advance(L)
return { type: 'OP', value: '!', start, end: L.b }
}
}
if (c === '"') {
advance(L)
return { type: 'DQUOTE', value: '"', start, end: L.b }
}
if (c === "'") {
const si = L.i
advance(L)
while (L.i < L.len && L.src[L.i] !== "'") advance(L)
if (L.i < L.len) advance(L)
return {
type: 'SQUOTE',
value: L.src.slice(si, L.i),
start,
end: L.b,
}
}
if (c === '$') {
if (c1 === '(' && c2 === '(') {
advance(L)
advance(L)
advance(L)
return { type: 'DOLLAR_DPAREN', value: '$((', start, end: L.b }
}
if (c1 === '(') {
advance(L)
advance(L)
return { type: 'DOLLAR_PAREN', value: '$(', start, end: L.b }
}
if (c1 === '{') {
advance(L)
advance(L)
return { type: 'DOLLAR_BRACE', value: '${', start, end: L.b }
}
if (c1 === "'") {
// ANSI-C string $'...'
const si = L.i
advance(L)
advance(L)
while (L.i < L.len && L.src[L.i] !== "'") {
if (L.src[L.i] === '\\' && L.i + 1 < L.len) advance(L)
advance(L)
}
if (L.i < L.len) advance(L)
return {
type: 'ANSI_C',
value: L.src.slice(si, L.i),
start,
end: L.b,
}
}
advance(L)
return { type: 'DOLLAR', value: '$', start, end: L.b }
}
if (c === '`') {
advance(L)
return { type: 'BACKTICK', value: '`', start, end: L.b }
}
// File descriptor before redirect: digit+ immediately followed by > or <
if (isDigit(c)) {
let j = L.i
while (j < L.len && isDigit(L.src[j]!)) j++
const after = j < L.len ? L.src[j]! : ''
if (after === '>' || after === '<') {
const si = L.i
while (L.i < j) advance(L)
return {
type: 'WORD',
value: L.src.slice(si, L.i),
start,
end: L.b,
}
}
}
// Word / number
if (isWordStart(c) || c === '{' || c === '}') {
const si = L.i
while (L.i < L.len) {
const ch = L.src[L.i]!
if (ch === '\\') {
if (L.i + 1 >= L.len) {
// Trailing `\` at EOF β tree-sitter excludes it from the word and
// emits a sibling ERROR. Stop here so the word ends before `\`.
break
}
// Escape next char (including \n for line continuation mid-word)
if (L.src[L.i + 1] === '\n') {
advance(L)
advance(L)
continue
}
advance(L)
advance(L)
continue
}
if (!isWordChar(ch) && ch !== '{' && ch !== '}') {
break
}
advance(L)
}
if (L.i > si) {
const v = L.src.slice(si, L.i)
// Number: optional sign then digits only
if (/^-?\d+$/.test(v)) {
return { type: 'NUMBER', value: v, start, end: L.b }
}
return { type: 'WORD', value: v, start, end: L.b }
}
// Empty word (lone `\` at EOF) β fall through to single-char consumer
}
// Unknown char β consume as single-char word
advance(L)
return { type: 'WORD', value: c, start, end: L.b }
}
// βββββββββββββββββββββββββββββ Parser βββββββββββββββββββββββββββββ
type ParseState = {
L: Lexer
src: string
srcBytes: number
/** True when byte offsets == char indices (no multi-byte UTF-8) */
isAscii: boolean
nodeCount: number
deadline: number
aborted: boolean
/** Depth of backtick nesting β inside `...`, ` terminates words */
inBacktick: number
/** When set, parseSimpleCommand stops at this token (for `[` backtrack) */
stopToken: string | null
}
function parseSource(source: string, timeoutMs?: number): TsNode | null {
const L = makeLexer(source)
const srcBytes = byteLengthUtf8(source)
const P: ParseState = {
L,
src: source,
srcBytes,
isAscii: srcBytes === source.length,
nodeCount: 0,
deadline: performance.now() + (timeoutMs ?? PARSE_TIMEOUT_MS),
aborted: false,
inBacktick: 0,
stopToken: null,
}
try {
const program = parseProgram(P)
if (P.aborted) return null
return program
} catch {
return null
}
}
function byteLengthUtf8(s: string): number {
let b = 0
for (let i = 0; i < s.length; i++) {
const c = s.charCodeAt(i)
if (c < 0x80) b++
else if (c < 0x800) b += 2
else if (c >= 0xd800 && c <= 0xdbff) {
b += 4
i++
} else b += 3
}
return b
}
function checkBudget(P: ParseState): void {
P.nodeCount++
if (P.nodeCount > MAX_NODES) {
P.aborted = true
throw new Error('budget')
}
if ((P.nodeCount & 0x7f) === 0 && performance.now() > P.deadline) {
P.aborted = true
throw new Error('timeout')
}
}
/** Build a node. Slices text from source by byte range via char-index lookup. */
function mk(
P: ParseState,
type: string,
start: number,
end: number,
children: TsNode[],
): TsNode {
checkBudget(P)
return {
type,
text: sliceBytes(P, start, end),
startIndex: start,
endIndex: end,
children,
}
}
function sliceBytes(P: ParseState, startByte: number, endByte: number): string {
if (P.isAscii) return P.src.slice(startByte, endByte)
// Find char indices for byte offsets. Build byte table if needed.
const L = P.L
if (!L.byteTable) byteAt(L, 0)
const t = L.byteTable!
// Binary search for char index where byte offset matches
let lo = 0
let hi = P.src.length
while (lo < hi) {
const m = (lo + hi) >>> 1
if (t[m]! < startByte) lo = m + 1
else hi = m
}
const sc = lo
lo = sc
hi = P.src.length
while (lo < hi) {
const m = (lo + hi) >>> 1
if (t[m]! < endByte) lo = m + 1
else hi = m
}
return P.src.slice(sc, lo)
}
function leaf(P: ParseState, type: string, tok: Token): TsNode {
return mk(P, type, tok.start, tok.end, [])
}
function parseProgram(P: ParseState): TsNode {
const children: TsNode[] = []
// Skip leading whitespace & newlines β program start is first content byte
skipBlanks(P.L)
while (true) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'NEWLINE') {
skipBlanks(P.L)
continue
}
restoreLex(P.L, save)
break
}
const progStart = P.L.b
while (P.L.i < P.L.len) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'EOF') break
if (t.type === 'NEWLINE') continue
if (t.type === 'COMMENT') {
children.push(leaf(P, 'comment', t))
continue
}
restoreLex(P.L, save)
const stmts = parseStatements(P, null)
for (const s of stmts) children.push(s)
if (stmts.length === 0) {
// Couldn't parse β emit ERROR and skip one token
const errTok = nextToken(P.L, 'cmd')
if (errTok.type === 'EOF') break
// Stray `;;` at program level (e.g., `var=;;` outside case) β tree-sitter
// silently elides. Keep leading `;` as ERROR (security: paste artifact).
if (
errTok.type === 'OP' &&
errTok.value === ';;' &&
children.length > 0
) {
continue
}
children.push(mk(P, 'ERROR', errTok.start, errTok.end, []))
}
}
// tree-sitter includes trailing whitespace in program extent
const progEnd = children.length > 0 ? P.srcBytes : progStart
return mk(P, 'program', progStart, progEnd, children)
}
/** Packed as (b << 16) | i β avoids heap alloc on every backtrack. */
type LexSave = number
function saveLex(L: Lexer): LexSave {
return L.b * 0x10000 + L.i
}
function restoreLex(L: Lexer, s: LexSave): void {
L.i = s & 0xffff
L.b = s >>> 16
}
/**
* Parse a sequence of statements separated by ; & newline. Returns a flat list
* where ; and & are sibling leaves (NOT wrapped in 'list' β only && || get
* that). Stops at terminator or EOF.
*/
function parseStatements(P: ParseState, terminator: string | null): TsNode[] {
const out: TsNode[] = []
while (true) {
skipBlanks(P.L)
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'EOF') {
restoreLex(P.L, save)
break
}
if (t.type === 'NEWLINE') {
// Process pending heredocs
if (P.L.heredocs.length > 0) {
scanHeredocBodies(P)
}
continue
}
if (t.type === 'COMMENT') {
out.push(leaf(P, 'comment', t))
continue
}
if (terminator && t.type === 'OP' && t.value === terminator) {
restoreLex(P.L, save)
break
}
if (
t.type === 'OP' &&
(t.value === ')' ||
t.value === '}' ||
t.value === ';;' ||
t.value === ';&' ||
t.value === ';;&' ||
t.value === '))' ||
t.value === ']]' ||
t.value === ']')
) {
restoreLex(P.L, save)
break
}
if (t.type === 'BACKTICK' && P.inBacktick > 0) {
restoreLex(P.L, save)
break
}
if (
t.type === 'WORD' &&
(t.value === 'then' ||
t.value === 'elif' ||
t.value === 'else' ||
t.value === 'fi' ||
t.value === 'do' ||
t.value === 'done' ||
t.value === 'esac')
) {
restoreLex(P.L, save)
break
}
restoreLex(P.L, save)
const stmt = parseAndOr(P)
if (!stmt) break
out.push(stmt)
// Look for separator
skipBlanks(P.L)
const save2 = saveLex(P.L)
const sep = nextToken(P.L, 'cmd')
if (sep.type === 'OP' && (sep.value === ';' || sep.value === '&')) {
// Check if terminator follows β if so, emit separator but stop
const save3 = saveLex(P.L)
const after = nextToken(P.L, 'cmd')
restoreLex(P.L, save3)
out.push(leaf(P, sep.value, sep))
if (
after.type === 'EOF' ||
(after.type === 'OP' &&
(after.value === ')' ||
after.value === '}' ||
after.value === ';;' ||
after.value === ';&' ||
after.value === ';;&')) ||
(after.type === 'WORD' &&
(after.value === 'then' ||
after.value === 'elif' ||
after.value === 'else' ||
after.value === 'fi' ||
after.value === 'do' ||
after.value === 'done' ||
after.value === 'esac'))
) {
// Trailing separator β don't include it at program level unless
// there's content after. But at inner levels we keep it.
continue
}
} else if (sep.type === 'NEWLINE') {
if (P.L.heredocs.length > 0) {
scanHeredocBodies(P)
}
continue
} else {
restoreLex(P.L, save2)
}
}
// Trim trailing separator if at program level
return out
}
/**
* Parse pipeline chains joined by && ||. Left-associative nesting.
* tree-sitter quirk: trailing redirect on the last pipeline wraps the ENTIRE
* list in a redirected_statement β `a > x && b > y` becomes
* redirected_statement(list(redirected_statement(a,>x), &&, b), >y).
*/
function parseAndOr(P: ParseState): TsNode | null {
let left = parsePipeline(P)
if (!left) return null
while (true) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'OP' && (t.value === '&&' || t.value === '||')) {
const op = leaf(P, t.value, t)
skipNewlines(P)
const right = parsePipeline(P)
if (!right) {
left = mk(P, 'list', left.startIndex, op.endIndex, [left, op])
break
}
// If right is a redirected_statement, hoist its redirects to wrap the list.
if (right.type === 'redirected_statement' && right.children.length >= 2) {
const inner = right.children[0]!
const redirs = right.children.slice(1)
const listNode = mk(P, 'list', left.startIndex, inner.endIndex, [
left,
op,
inner,
])
const lastR = redirs[redirs.length - 1]!
left = mk(
P,
'redirected_statement',
listNode.startIndex,
lastR.endIndex,
[listNode, ...redirs],
)
} else {
left = mk(P, 'list', left.startIndex, right.endIndex, [left, op, right])
}
} else {
restoreLex(P.L, save)
break
}
}
return left
}
function skipNewlines(P: ParseState): void {
while (true) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type !== 'NEWLINE') {
restoreLex(P.L, save)
break
}
}
}
/**
* Parse commands joined by | or |&. Flat children with operator leaves.
* tree-sitter quirk: `a | b 2>nul | c` hoists the redirect on `b` to wrap
* the preceding pipeline fragment β pipeline(redirected_statement(
* pipeline(a,|,b), 2>nul), |, c).
*/
function parsePipeline(P: ParseState): TsNode | null {
let first = parseCommand(P)
if (!first) return null
const parts: TsNode[] = [first]
while (true) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'OP' && (t.value === '|' || t.value === '|&')) {
const op = leaf(P, t.value, t)
skipNewlines(P)
const next = parseCommand(P)
if (!next) {
parts.push(op)
break
}
// Hoist trailing redirect on `next` to wrap current pipeline fragment
if (
next.type === 'redirected_statement' &&
next.children.length >= 2 &&
parts.length >= 1
) {
const inner = next.children[0]!
const redirs = next.children.slice(1)
// Wrap existing parts + op + inner as a pipeline
const pipeKids = [...parts, op, inner]
const pipeNode = mk(
P,
'pipeline',
pipeKids[0]!.startIndex,
inner.endIndex,
pipeKids,
)
const lastR = redirs[redirs.length - 1]!
const wrapped = mk(
P,
'redirected_statement',
pipeNode.startIndex,
lastR.endIndex,
[pipeNode, ...redirs],
)
parts.length = 0
parts.push(wrapped)
first = wrapped
continue
}
parts.push(op, next)
} else {
restoreLex(P.L, save)
break
}
}
if (parts.length === 1) return parts[0]!
const last = parts[parts.length - 1]!
return mk(P, 'pipeline', parts[0]!.startIndex, last.endIndex, parts)
}
/** Parse a single command: simple, compound, or control structure. */
function parseCommand(P: ParseState): TsNode | null {
skipBlanks(P.L)
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'EOF') {
restoreLex(P.L, save)
return null
}
// Negation β tree-sitter wraps just the command, redirects go outside.
// `! cmd > out` β redirected_statement(negated_command(!, cmd), >out)
if (t.type === 'OP' && t.value === '!') {
const bang = leaf(P, '!', t)
const inner = parseCommand(P)
if (!inner) {
restoreLex(P.L, save)
return null
}
// If inner is a redirected_statement, hoist redirects outside negation
if (inner.type === 'redirected_statement' && inner.children.length >= 2) {
const cmd = inner.children[0]!
const redirs = inner.children.slice(1)
const neg = mk(P, 'negated_command', bang.startIndex, cmd.endIndex, [
bang,
cmd,
])
const lastR = redirs[redirs.length - 1]!
return mk(P, 'redirected_statement', neg.startIndex, lastR.endIndex, [
neg,
...redirs,
])
}
return mk(P, 'negated_command', bang.startIndex, inner.endIndex, [
bang,
inner,
])
}
if (t.type === 'OP' && t.value === '(') {
const open = leaf(P, '(', t)
const body = parseStatements(P, ')')
const closeTok = nextToken(P.L, 'cmd')
const close =
closeTok.type === 'OP' && closeTok.value === ')'
? leaf(P, ')', closeTok)
: mk(P, ')', open.endIndex, open.endIndex, [])
const node = mk(P, 'subshell', open.startIndex, close.endIndex, [
open,
...body,
close,
])
return maybeRedirect(P, node)
}
if (t.type === 'OP' && t.value === '((') {
const open = leaf(P, '((', t)
const exprs = parseArithCommaList(P, '))', 'var')
const closeTok = nextToken(P.L, 'cmd')
const close =
closeTok.value === '))'
? leaf(P, '))', closeTok)
: mk(P, '))', open.endIndex, open.endIndex, [])
return mk(P, 'compound_statement', open.startIndex, close.endIndex, [
open,
...exprs,
close,
])
}
if (t.type === 'OP' && t.value === '{') {
const open = leaf(P, '{', t)
const body = parseStatements(P, '}')
const closeTok = nextToken(P.L, 'cmd')
const close =
closeTok.type === 'OP' && closeTok.value === '}'
? leaf(P, '}', closeTok)
: mk(P, '}', open.endIndex, open.endIndex, [])
const node = mk(P, 'compound_statement', open.startIndex, close.endIndex, [
open,
...body,
close,
])
return maybeRedirect(P, node)
}
if (t.type === 'OP' && (t.value === '[' || t.value === '[[')) {
const open = leaf(P, t.value, t)
const closer = t.value === '[' ? ']' : ']]'
// Grammar: `[` can contain choice(_expression, redirected_statement).
// Try _expression first; if we don't reach `]`, backtrack and parse as
// redirected_statement (handles `[ ! cmd -v go &>/dev/null ]`).
const exprSave = saveLex(P.L)
let expr = parseTestExpr(P, closer)
skipBlanks(P.L)
if (t.value === '[' && peek(P.L) !== ']') {
// Expression parse didn't reach `]` β try as redirected_statement.
// Thread `]` stop-token so parseSimpleCommand doesn't eat it as arg.
restoreLex(P.L, exprSave)
const prevStop = P.stopToken
P.stopToken = ']'
const rstmt = parseCommand(P)
P.stopToken = prevStop
if (rstmt && rstmt.type === 'redirected_statement') {
expr = rstmt
} else {
// Neither worked β restore and keep the expression result
restoreLex(P.L, exprSave)
expr = parseTestExpr(P, closer)
}
skipBlanks(P.L)
}
const closeTok = nextToken(P.L, 'arg')
let close: TsNode
if (closeTok.value === closer) {
close = leaf(P, closer, closeTok)
} else {
close = mk(P, closer, open.endIndex, open.endIndex, [])
}
const kids = expr ? [open, expr, close] : [open, close]
return mk(P, 'test_command', open.startIndex, close.endIndex, kids)
}
if (t.type === 'WORD') {
if (t.value === 'if') return maybeRedirect(P, parseIf(P, t), true)
if (t.value === 'while' || t.value === 'until')
return maybeRedirect(P, parseWhile(P, t), true)
if (t.value === 'for') return maybeRedirect(P, parseFor(P, t), true)
if (t.value === 'select') return maybeRedirect(P, parseFor(P, t), true)
if (t.value === 'case') return maybeRedirect(P, parseCase(P, t), true)
if (t.value === 'function') return parseFunction(P, t)
if (DECL_KEYWORDS.has(t.value))
return maybeRedirect(P, parseDeclaration(P, t))
if (t.value === 'unset' || t.value === 'unsetenv') {
return maybeRedirect(P, parseUnset(P, t))
}
}
restoreLex(P.L, save)
return parseSimpleCommand(P)
}
/**
* Parse a simple command: [assignment]* word [arg|redirect]*
* Returns variable_assignment if only one assignment and no command.
*/
function parseSimpleCommand(P: ParseState): TsNode | null {
const start = P.L.b
const assignments: TsNode[] = []
const preRedirects: TsNode[] = []
while (true) {
skipBlanks(P.L)
const a = tryParseAssignment(P)
if (a) {
assignments.push(a)
continue
}
const r = tryParseRedirect(P)
if (r) {
preRedirects.push(r)
continue
}
break
}
skipBlanks(P.L)
const save = saveLex(P.L)
const nameTok = nextToken(P.L, 'cmd')
if (
nameTok.type === 'EOF' ||
nameTok.type === 'NEWLINE' ||
nameTok.type === 'COMMENT' ||
(nameTok.type === 'OP' &&
nameTok.value !== '{' &&
nameTok.value !== '[' &&
nameTok.value !== '[[') ||
(nameTok.type === 'WORD' &&
SHELL_KEYWORDS.has(nameTok.value) &&
nameTok.value !== 'in')
) {
restoreLex(P.L, save)
// No command β standalone assignment(s) or redirect
if (assignments.length === 1 && preRedirects.length === 0) {
return assignments[0]!
}
if (preRedirects.length > 0 && assignments.length === 0) {
// Bare redirect β redirected_statement with just file_redirect children
const last = preRedirects[preRedirects.length - 1]!
return mk(
P,
'redirected_statement',
preRedirects[0]!.startIndex,
last.endIndex,
preRedirects,
)
}
if (assignments.length > 1 && preRedirects.length === 0) {
// `A=1 B=2` with no command β variable_assignments (plural)
const last = assignments[assignments.length - 1]!
return mk(
P,
'variable_assignments',
assignments[0]!.startIndex,
last.endIndex,
assignments,
)
}
if (assignments.length > 0 || preRedirects.length > 0) {
const all = [...assignments, ...preRedirects]
const last = all[all.length - 1]!
return mk(P, 'command', start, last.endIndex, all)
}
return null
}
restoreLex(P.L, save)
// Check for function definition: name() { ... }
const fnSave = saveLex(P.L)
const nm = parseWord(P, 'cmd')
if (nm && nm.type === 'word') {
skipBlanks(P.L)
if (peek(P.L) === '(' && peek(P.L, 1) === ')') {
const oTok = nextToken(P.L, 'cmd')
const cTok = nextToken(P.L, 'cmd')
const oParen = leaf(P, '(', oTok)
const cParen = leaf(P, ')', cTok)
skipBlanks(P.L)
skipNewlines(P)
const body = parseCommand(P)
if (body) {
// If body is redirected_statement(compound_statement, file_redirect...),
// hoist redirects to function_definition level per tree-sitter grammar
let bodyKids: TsNode[] = [body]
if (
body.type === 'redirected_statement' &&
body.children.length >= 2 &&
body.children[0]!.type === 'compound_statement'
) {
bodyKids = body.children
}
const last = bodyKids[bodyKids.length - 1]!
return mk(P, 'function_definition', nm.startIndex, last.endIndex, [
nm,
oParen,
cParen,
...bodyKids,
])
}
}
}
restoreLex(P.L, fnSave)
const nameArg = parseWord(P, 'cmd')
if (!nameArg) {
if (assignments.length === 1) return assignments[0]!
return null
}
const cmdName = mk(P, 'command_name', nameArg.startIndex, nameArg.endIndex, [
nameArg,
])
const args: TsNode[] = []
const redirects: TsNode[] = []
let heredocRedirect: TsNode | null = null
while (true) {
skipBlanks(P.L)
// Post-command redirects are greedy (repeat1 $._literal) β once a redirect
// appears after command_name, subsequent literals attach to it per grammar's
// prec.left. `grep 2>/dev/null -q foo` β file_redirect eats `-q foo`.
// Args parsed BEFORE the first redirect still go to command (cat a b > out).
const r = tryParseRedirect(P, true)
if (r) {
if (r.type === 'heredoc_redirect') {
heredocRedirect = r
} else if (r.type === 'herestring_redirect') {
args.push(r)
} else {
redirects.push(r)
}
continue
}
// Once a file_redirect has been seen, command args are done β grammar's
// command rule doesn't allow file_redirect in its post-name choice, so
// anything after belongs to redirected_statement's file_redirect children.
if (redirects.length > 0) break
// `[` test_command backtrack β stop at `]` so outer handler can consume it
if (P.stopToken === ']' && peek(P.L) === ']') break
const save2 = saveLex(P.L)
const pk = nextToken(P.L, 'arg')
if (
pk.type === 'EOF' ||
pk.type === 'NEWLINE' ||
pk.type === 'COMMENT' ||
(pk.type === 'OP' &&
(pk.value === '|' ||
pk.value === '|&' ||
pk.value === '&&' ||
pk.value === '||' ||
pk.value === ';' ||
pk.value === ';;' ||
pk.value === ';&' ||
pk.value === ';;&' ||
pk.value === '&' ||
pk.value === ')' ||
pk.value === '}' ||
pk.value === '))'))
) {
restoreLex(P.L, save2)
break
}
restoreLex(P.L, save2)
const arg = parseWord(P, 'arg')
if (!arg) {
// Lone `(` in arg position β tree-sitter parses this as subshell arg
// e.g., `echo =(cmd)` β command has ERROR(=), subshell(cmd) as args
if (peek(P.L) === '(') {
const oTok = nextToken(P.L, 'cmd')
const open = leaf(P, '(', oTok)
const body = parseStatements(P, ')')
const cTok = nextToken(P.L, 'cmd')
const close =
cTok.type === 'OP' && cTok.value === ')'
? leaf(P, ')', cTok)
: mk(P, ')', open.endIndex, open.endIndex, [])
args.push(
mk(P, 'subshell', open.startIndex, close.endIndex, [
open,
...body,
close,
]),
)
continue
}
break
}
// Lone `=` in arg position is a parse error in bash β tree-sitter wraps
// it in ERROR for recovery. Happens in `echo =(cmd)` (zsh process-sub).
if (arg.type === 'word' && arg.text === '=') {
args.push(mk(P, 'ERROR', arg.startIndex, arg.endIndex, [arg]))
continue
}
// Word immediately followed by `(` (no whitespace) is a parse error β
// bash doesn't allow glob-then-subshell adjacency. tree-sitter wraps the
// word in ERROR. Catches zsh glob qualifiers like `*.(e:'cmd':)`.
if (
(arg.type === 'word' || arg.type === 'concatenation') &&
peek(P.L) === '(' &&
P.L.b === arg.endIndex
) {
args.push(mk(P, 'ERROR', arg.startIndex, arg.endIndex, [arg]))
continue
}
args.push(arg)
}
// preRedirects (e.g., `2>&1 cat`, `<<<str cmd`) go INSIDE the command node
// before command_name per tree-sitter grammar, not in redirected_statement
const cmdChildren = [...assignments, ...preRedirects, cmdName, ...args]
const cmdEnd =
cmdChildren.length > 0
? cmdChildren[cmdChildren.length - 1]!.endIndex
: cmdName.endIndex
const cmdStart = cmdChildren[0]!.startIndex
const cmd = mk(P, 'command', cmdStart, cmdEnd, cmdChildren)
if (heredocRedirect) {
// Scan heredoc body now
scanHeredocBodies(P)
const hd = P.L.heredocs.shift()
if (hd && heredocRedirect.children.length >= 2) {
const bodyNode = mk(
P,
'heredoc_body',
hd.bodyStart,
hd.bodyEnd,
hd.quoted ? [] : parseHeredocBodyContent(P, hd.bodyStart, hd.bodyEnd),
)
const endNode = mk(P, 'heredoc_end', hd.endStart, hd.endEnd, [])
heredocRedirect.children.push(bodyNode, endNode)
heredocRedirect.endIndex = hd.endEnd
heredocRedirect.text = sliceBytes(
P,
heredocRedirect.startIndex,
hd.endEnd,
)
}
const allR = [...preRedirects, heredocRedirect, ...redirects]
const rStart =
preRedirects.length > 0
? Math.min(cmd.startIndex, preRedirects[0]!.startIndex)
: cmd.startIndex
return mk(P, 'redirected_statement', rStart, heredocRedirect.endIndex, [
cmd,
...allR,
])
}
if (redirects.length > 0) {
const last = redirects[redirects.length - 1]!
return mk(P, 'redirected_statement', cmd.startIndex, last.endIndex, [
cmd,
...redirects,
])
}
return cmd
}
function maybeRedirect(
P: ParseState,
node: TsNode,
allowHerestring = false,
): TsNode {
const redirects: TsNode[] = []
while (true) {
skipBlanks(P.L)
const save = saveLex(P.L)
const r = tryParseRedirect(P)
if (!r) break
if (r.type === 'herestring_redirect' && !allowHerestring) {
restoreLex(P.L, save)
break
}
redirects.push(r)
}
if (redirects.length === 0) return node
const last = redirects[redirects.length - 1]!
return mk(P, 'redirected_statement', node.startIndex, last.endIndex, [
node,
...redirects,
])
}
function tryParseAssignment(P: ParseState): TsNode | null {
const save = saveLex(P.L)
skipBlanks(P.L)
const startB = P.L.b
// Must start with identifier
if (!isIdentStart(peek(P.L))) {
restoreLex(P.L, save)
return null
}
while (isIdentChar(peek(P.L))) advance(P.L)
const nameEnd = P.L.b
// Optional subscript
let subEnd = nameEnd
if (peek(P.L) === '[') {
advance(P.L)
let depth = 1
while (P.L.i < P.L.len && depth > 0) {
const c = peek(P.L)
if (c === '[') depth++
else if (c === ']') depth--
advance(P.L)
}
subEnd = P.L.b
}
const c = peek(P.L)
const c1 = peek(P.L, 1)
let op: string
if (c === '=' && c1 !== '=') {
op = '='
} else if (c === '+' && c1 === '=') {
op = '+='
} else {
restoreLex(P.L, save)
return null
}
const nameNode = mk(P, 'variable_name', startB, nameEnd, [])
// Subscript handling: wrap in subscript node if present
let lhs: TsNode = nameNode
if (subEnd > nameEnd) {
const brOpen = mk(P, '[', nameEnd, nameEnd + 1, [])
const idx = parseSubscriptIndex(P, nameEnd + 1, subEnd - 1)
const brClose = mk(P, ']', subEnd - 1, subEnd, [])
lhs = mk(P, 'subscript', startB, subEnd, [nameNode, brOpen, idx, brClose])
}
const opStart = P.L.b
advance(P.L)
if (op === '+=') advance(P.L)
const opEnd = P.L.b
const opNode = mk(P, op, opStart, opEnd, [])
let val: TsNode | null = null
if (peek(P.L) === '(') {
// Array
const aoTok = nextToken(P.L, 'cmd')
const aOpen = leaf(P, '(', aoTok)
const elems: TsNode[] = [aOpen]
while (true) {
skipBlanks(P.L)
if (peek(P.L) === ')') break
const e = parseWord(P, 'arg')
if (!e) break
elems.push(e)
}
const acTok = nextToken(P.L, 'cmd')
const aClose =
acTok.value === ')'
? leaf(P, ')', acTok)
: mk(P, ')', aOpen.endIndex, aOpen.endIndex, [])
elems.push(aClose)
val = mk(P, 'array', aOpen.startIndex, aClose.endIndex, elems)
} else {
const c2 = peek(P.L)
if (
c2 &&
c2 !== ' ' &&
c2 !== '\t' &&
c2 !== '\n' &&
c2 !== ';' &&
c2 !== '&' &&
c2 !== '|' &&
c2 !== ')' &&
c2 !== '}'
) {
val = parseWord(P, 'arg')
}
}
const kids = val ? [lhs, opNode, val] : [lhs, opNode]
const end = val ? val.endIndex : opEnd
return mk(P, 'variable_assignment', startB, end, kids)
}
/**
* Parse subscript index content. Parsed arithmetically per tree-sitter grammar:
* `${a[1+2]}` β binary_expression; `${a[++i]}` β unary_expression(word);
* `${a[(($n+1))]}` β compound_statement(binary_expression). Falls back to
* simple patterns (@, *) as word.
*/
function parseSubscriptIndexInline(P: ParseState): TsNode | null {
skipBlanks(P.L)
const c = peek(P.L)
// @ or * alone β word (associative array all-keys)
if ((c === '@' || c === '*') && peek(P.L, 1) === ']') {
const s = P.L.b
advance(P.L)
return mk(P, 'word', s, P.L.b, [])
}
// ((expr)) β compound_statement wrapping the inner arithmetic
if (c === '(' && peek(P.L, 1) === '(') {
const oStart = P.L.b
advance(P.L)
advance(P.L)
const open = mk(P, '((', oStart, P.L.b, [])
const inner = parseArithExpr(P, '))', 'var')
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')' && peek(P.L, 1) === ')') {
const cs = P.L.b
advance(P.L)
advance(P.L)
close = mk(P, '))', cs, P.L.b, [])
} else {
close = mk(P, '))', P.L.b, P.L.b, [])
}
const kids = inner ? [open, inner, close] : [open, close]
return mk(P, 'compound_statement', open.startIndex, close.endIndex, kids)
}
// Arithmetic β but bare identifiers in subscript use 'word' mode per
// tree-sitter (${words[++counter]} β unary_expression(word)).
return parseArithExpr(P, ']', 'word')
}
/** Legacy byte-range subscript index parser β kept for callers that pre-scan. */
function parseSubscriptIndex(
P: ParseState,
startB: number,
endB: number,
): TsNode {
const text = sliceBytes(P, startB, endB)
if (/^\d+$/.test(text)) return mk(P, 'number', startB, endB, [])
const m = /^\$([a-zA-Z_]\w*)$/.exec(text)
if (m) {
const dollar = mk(P, '$', startB, startB + 1, [])
const vn = mk(P, 'variable_name', startB + 1, endB, [])
return mk(P, 'simple_expansion', startB, endB, [dollar, vn])
}
if (text.length === 2 && text[0] === '$' && SPECIAL_VARS.has(text[1]!)) {
const dollar = mk(P, '$', startB, startB + 1, [])
const vn = mk(P, 'special_variable_name', startB + 1, endB, [])
return mk(P, 'simple_expansion', startB, endB, [dollar, vn])
}
return mk(P, 'word', startB, endB, [])
}
/**
* Can the current position start a redirect destination literal?
* Returns false at redirect ops, terminators, or file-descriptor-prefixed ops
* so file_redirect's repeat1($._literal) stops at the right boundary.
*/
function isRedirectLiteralStart(P: ParseState): boolean {
const c = peek(P.L)
if (c === '' || c === '\n') return false
// Shell terminators and operators
if (c === '|' || c === '&' || c === ';' || c === '(' || c === ')')
return false
// Redirect operators (< > with any suffix; <( >( handled by caller)
if (c === '<' || c === '>') {
// <( >( are process substitutions β those ARE literals
return peek(P.L, 1) === '('
}
// N< N> file descriptor prefix β starts a new redirect, not a literal
if (isDigit(c)) {
let j = P.L.i
while (j < P.L.len && isDigit(P.L.src[j]!)) j++
const after = j < P.L.len ? P.L.src[j]! : ''
if (after === '>' || after === '<') return false
}
// `}` only terminates if we're in a context where it's a closer β but
// file_redirect sees `}` as word char (e.g., `>$HOME}` is valid path char).
// Actually `}` at top level terminates compound_statement β need to stop.
if (c === '}') return false
// Test command closer β when parseSimpleCommand is called from `[` context,
// `]` must terminate so parseCommand can return and `[` handler consume it.
if (P.stopToken === ']' && c === ']') return false
return true
}
/**
* Parse a redirect operator + destination(s).
* @param greedy When true, file_redirect consumes repeat1($._literal) per
* grammar's prec.left β `cmd >f a b c` attaches `a b c` to the redirect.
* When false (preRedirect context), takes only 1 destination because
* command's dynamic precedence beats redirected_statement's prec(-1).
*/
function tryParseRedirect(P: ParseState, greedy = false): TsNode | null {
const save = saveLex(P.L)
skipBlanks(P.L)
// File descriptor prefix?
let fd: TsNode | null = null
if (isDigit(peek(P.L))) {
const startB = P.L.b
let j = P.L.i
while (j < P.L.len && isDigit(P.L.src[j]!)) j++
const after = j < P.L.len ? P.L.src[j]! : ''
if (after === '>' || after === '<') {
while (P.L.i < j) advance(P.L)
fd = mk(P, 'file_descriptor', startB, P.L.b, [])
}
}
const t = nextToken(P.L, 'arg')
if (t.type !== 'OP') {
restoreLex(P.L, save)
return null
}
const v = t.value
if (v === '<<<') {
const op = leaf(P, '<<<', t)
skipBlanks(P.L)
const target = parseWord(P, 'arg')
const end = target ? target.endIndex : op.endIndex
const kids = target ? [op, target] : [op]
return mk(
P,
'herestring_redirect',
fd ? fd.startIndex : op.startIndex,
end,
fd ? [fd, ...kids] : kids,
)
}
if (v === '<<' || v === '<<-') {
const op = leaf(P, v, t)
// Heredoc start β delimiter word (may be quoted)
skipBlanks(P.L)
const dStart = P.L.b
let quoted = false
let delim = ''
const dc = peek(P.L)
if (dc === "'" || dc === '"') {
quoted = true
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== dc) {
delim += peek(P.L)
advance(P.L)
}
if (P.L.i < P.L.len) advance(P.L)
} else if (dc === '\\') {
// Backslash-escaped delimiter: \X β exactly one escaped char, body is
// quoted (literal). Covers <<\EOF <<\' <<\\ etc.
quoted = true
advance(P.L)
if (P.L.i < P.L.len && peek(P.L) !== '\n') {
delim += peek(P.L)
advance(P.L)
}
// May be followed by more ident chars (e.g. <<\EOF β delim "EOF")
while (P.L.i < P.L.len && isIdentChar(peek(P.L))) {
delim += peek(P.L)
advance(P.L)
}
} else {
// Unquoted delimiter: bash accepts most non-metacharacters (not just
// identifiers). Allow !, -, ., etc. β stop at shell metachars.
while (P.L.i < P.L.len && isHeredocDelimChar(peek(P.L))) {
delim += peek(P.L)
advance(P.L)
}
}
const dEnd = P.L.b
const startNode = mk(P, 'heredoc_start', dStart, dEnd, [])
// Register pending heredoc β body scanned at next newline
P.L.heredocs.push({
delim,
stripTabs: v === '<<-',
quoted,
bodyStart: 0,
bodyEnd: 0,
endStart: 0,
endEnd: 0,
})
const kids = fd ? [fd, op, startNode] : [op, startNode]
const startIdx = fd ? fd.startIndex : op.startIndex
// SECURITY: tree-sitter nests any pipeline/list/file_redirect appearing
// between heredoc_start and the newline as a CHILD of heredoc_redirect.
// `ls <<'EOF' | rm -rf /tmp/evil` must not silently drop the rm. Parse
// trailing words and file_redirects properly (ast.ts walkHeredocRedirect
// fails closed on any unrecognized child via tooComplex). Pipeline / list
// operators (| && || ;) are structurally complex β emit ERROR so the same
// fail-closed path rejects them.
while (true) {
skipBlanks(P.L)
const tc = peek(P.L)
if (tc === '\n' || tc === '' || P.L.i >= P.L.len) break
// File redirect after delimiter: cat <<EOF > out.txt
if (tc === '>' || tc === '<' || isDigit(tc)) {
const rSave = saveLex(P.L)
const r = tryParseRedirect(P)
if (r && r.type === 'file_redirect') {
kids.push(r)
continue
}
restoreLex(P.L, rSave)
}
// Pipeline after heredoc_start: `one <<EOF | grep two` β tree-sitter
// nests the pipeline as a child of heredoc_redirect. ast.ts
// walkHeredocRedirect fails closed on pipeline/command via tooComplex.
if (tc === '|' && peek(P.L, 1) !== '|') {
advance(P.L)
skipBlanks(P.L)
const pipeCmds: TsNode[] = []
while (true) {
const cmd = parseCommand(P)
if (!cmd) break
pipeCmds.push(cmd)
skipBlanks(P.L)
if (peek(P.L) === '|' && peek(P.L, 1) !== '|') {
const ps = P.L.b
advance(P.L)
pipeCmds.push(mk(P, '|', ps, P.L.b, []))
skipBlanks(P.L)
continue
}
break
}
if (pipeCmds.length > 0) {
const pl = pipeCmds[pipeCmds.length - 1]!
// tree-sitter always wraps in pipeline after `|`, even single command
kids.push(
mk(P, 'pipeline', pipeCmds[0]!.startIndex, pl.endIndex, pipeCmds),
)
}
continue
}
// && / || after heredoc_start: `cat <<-EOF || die "..."` β tree-sitter
// nests just the RHS command (not a list) as a child of heredoc_redirect.
if (
(tc === '&' && peek(P.L, 1) === '&') ||
(tc === '|' && peek(P.L, 1) === '|')
) {
advance(P.L)
advance(P.L)
skipBlanks(P.L)
const rhs = parseCommand(P)
if (rhs) kids.push(rhs)
continue
}
// Terminator / unhandled metachar β consume rest of line as ERROR so
// ast.ts rejects it. Covers ; & ( )
if (tc === '&' || tc === ';' || tc === '(' || tc === ')') {
const eStart = P.L.b
while (P.L.i < P.L.len && peek(P.L) !== '\n') advance(P.L)
kids.push(mk(P, 'ERROR', eStart, P.L.b, []))
break
}
// Trailing word argument: newins <<-EOF - org.freedesktop.service
const w = parseWord(P, 'arg')
if (w) {
kids.push(w)
continue
}
// Unrecognized β consume rest of line as ERROR
const eStart = P.L.b
while (P.L.i < P.L.len && peek(P.L) !== '\n') advance(P.L)
if (P.L.b > eStart) kids.push(mk(P, 'ERROR', eStart, P.L.b, []))
break
}
return mk(P, 'heredoc_redirect', startIdx, P.L.b, kids)
}
// Close-fd variants: `<&-` `>&-` have OPTIONAL destination (0 or 1)
if (v === '<&-' || v === '>&-') {
const op = leaf(P, v, t)
const kids: TsNode[] = []
if (fd) kids.push(fd)
kids.push(op)
// Optional single destination β only consume if next is a literal
skipBlanks(P.L)
const dSave = saveLex(P.L)
const dest = isRedirectLiteralStart(P) ? parseWord(P, 'arg') : null
if (dest) {
kids.push(dest)
} else {
restoreLex(P.L, dSave)
}
const startIdx = fd ? fd.startIndex : op.startIndex
const end = dest ? dest.endIndex : op.endIndex
return mk(P, 'file_redirect', startIdx, end, kids)
}
if (
v === '>' ||
v === '>>' ||
v === '>&' ||
v === '>|' ||
v === '&>' ||
v === '&>>' ||
v === '<' ||
v === '<&'
) {
const op = leaf(P, v, t)
const kids: TsNode[] = []
if (fd) kids.push(fd)
kids.push(op)
// Grammar: destination is repeat1($._literal) β greedily consume literals
// until a non-literal (redirect op, terminator, etc). tree-sitter's
// prec.left makes `cmd >f a b c` attach `a b c` to the file_redirect,
// NOT to the command. Structural quirk but required for corpus parity.
// In preRedirect context (greedy=false), take only 1 literal because
// command's dynamic precedence beats redirected_statement's prec(-1).
let end = op.endIndex
let taken = 0
while (true) {
skipBlanks(P.L)
if (!isRedirectLiteralStart(P)) break
if (!greedy && taken >= 1) break
const tc = peek(P.L)
const tc1 = peek(P.L, 1)
let target: TsNode | null = null
if ((tc === '<' || tc === '>') && tc1 === '(') {
target = parseProcessSub(P)
} else {
target = parseWord(P, 'arg')
}
if (!target) break
kids.push(target)
end = target.endIndex
taken++
}
const startIdx = fd ? fd.startIndex : op.startIndex
return mk(P, 'file_redirect', startIdx, end, kids)
}
restoreLex(P.L, save)
return null
}
function parseProcessSub(P: ParseState): TsNode | null {
const c = peek(P.L)
if ((c !== '<' && c !== '>') || peek(P.L, 1) !== '(') return null
const start = P.L.b
advance(P.L)
advance(P.L)
const open = mk(P, c + '(', start, P.L.b, [])
const body = parseStatements(P, ')')
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')') {
const cs = P.L.b
advance(P.L)
close = mk(P, ')', cs, P.L.b, [])
} else {
close = mk(P, ')', P.L.b, P.L.b, [])
}
return mk(P, 'process_substitution', start, close.endIndex, [
open,
...body,
close,
])
}
function scanHeredocBodies(P: ParseState): void {
// Skip to newline if not already there
while (P.L.i < P.L.len && P.L.src[P.L.i] !== '\n') advance(P.L)
if (P.L.i < P.L.len) advance(P.L)
for (const hd of P.L.heredocs) {
hd.bodyStart = P.L.b
const delimLen = hd.delim.length
while (P.L.i < P.L.len) {
const lineStart = P.L.i
const lineStartB = P.L.b
// Skip leading tabs if <<-
let checkI = lineStart
if (hd.stripTabs) {
while (checkI < P.L.len && P.L.src[checkI] === '\t') checkI++
}
// Check if this line is the delimiter
if (
P.L.src.startsWith(hd.delim, checkI) &&
(checkI + delimLen >= P.L.len ||
P.L.src[checkI + delimLen] === '\n' ||
P.L.src[checkI + delimLen] === '\r')
) {
hd.bodyEnd = lineStartB
// Advance past tabs
while (P.L.i < checkI) advance(P.L)
hd.endStart = P.L.b
// Advance past delimiter
for (let k = 0; k < delimLen; k++) advance(P.L)
hd.endEnd = P.L.b
// Skip trailing newline
if (P.L.i < P.L.len && P.L.src[P.L.i] === '\n') advance(P.L)
return
}
// Consume line
while (P.L.i < P.L.len && P.L.src[P.L.i] !== '\n') advance(P.L)
if (P.L.i < P.L.len) advance(P.L)
}
// Unterminated
hd.bodyEnd = P.L.b
hd.endStart = P.L.b
hd.endEnd = P.L.b
}
}
function parseHeredocBodyContent(
P: ParseState,
start: number,
end: number,
): TsNode[] {
// Parse expansions inside an unquoted heredoc body.
const saved = saveLex(P.L)
// Position lexer at body start
restoreLexToByte(P, start)
const out: TsNode[] = []
let contentStart = P.L.b
// tree-sitter-bash's heredoc_body rule hides the initial text segment
// (_heredoc_body_beginning) β only content AFTER the first expansion is
// emitted as heredoc_content. Track whether we've seen an expansion yet.
let sawExpansion = false
while (P.L.b < end) {
const c = peek(P.L)
// Backslash escapes suppress expansion: \$ \` stay literal in heredoc.
if (c === '\\') {
const nxt = peek(P.L, 1)
if (nxt === '$' || nxt === '`' || nxt === '\\') {
advance(P.L)
advance(P.L)
continue
}
advance(P.L)
continue
}
if (c === '$' || c === '`') {
const preB = P.L.b
const exp = parseDollarLike(P)
// Bare `$` followed by non-name (e.g. `$'` in a regex) returns a lone
// '$' leaf, not an expansion β treat as literal content, don't split.
if (
exp &&
(exp.type === 'simple_expansion' ||
exp.type === 'expansion' ||
exp.type === 'command_substitution' ||
exp.type === 'arithmetic_expansion')
) {
if (sawExpansion && preB > contentStart) {
out.push(mk(P, 'heredoc_content', contentStart, preB, []))
}
out.push(exp)
contentStart = P.L.b
sawExpansion = true
}
continue
}
advance(P.L)
}
// Only emit heredoc_content children if there were expansions β otherwise
// the heredoc_body is a leaf node (tree-sitter convention).
if (sawExpansion) {
out.push(mk(P, 'heredoc_content', contentStart, end, []))
}
restoreLex(P.L, saved)
return out
}
function restoreLexToByte(P: ParseState, targetByte: number): void {
if (!P.L.byteTable) byteAt(P.L, 0)
const t = P.L.byteTable!
let lo = 0
let hi = P.src.length
while (lo < hi) {
const m = (lo + hi) >>> 1
if (t[m]! < targetByte) lo = m + 1
else hi = m
}
P.L.i = lo
P.L.b = targetByte
}
/**
* Parse a word-position element: bare word, string, expansion, or concatenation
* thereof. Returns a single node; if multiple adjacent fragments, wraps in
* concatenation.
*/
function parseWord(P: ParseState, _ctx: 'cmd' | 'arg'): TsNode | null {
skipBlanks(P.L)
const parts: TsNode[] = []
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (
c === ' ' ||
c === '\t' ||
c === '\n' ||
c === '\r' ||
c === '' ||
c === '|' ||
c === '&' ||
c === ';' ||
c === '(' ||
c === ')'
) {
break
}
// < > are redirect operators unless <( >( (process substitution)
if (c === '<' || c === '>') {
if (peek(P.L, 1) === '(') {
const ps = parseProcessSub(P)
if (ps) parts.push(ps)
continue
}
break
}
if (c === '"') {
parts.push(parseDoubleQuoted(P))
continue
}
if (c === "'") {
const tok = nextToken(P.L, 'arg')
parts.push(leaf(P, 'raw_string', tok))
continue
}
if (c === '$') {
const c1 = peek(P.L, 1)
if (c1 === "'") {
const tok = nextToken(P.L, 'arg')
parts.push(leaf(P, 'ansi_c_string', tok))
continue
}
if (c1 === '"') {
// Translated string: emit $ leaf + string node
const dTok: Token = {
type: 'DOLLAR',
value: '$',
start: P.L.b,
end: P.L.b + 1,
}
advance(P.L)
parts.push(leaf(P, '$', dTok))
parts.push(parseDoubleQuoted(P))
continue
}
if (c1 === '`') {
// `$` followed by backtick β tree-sitter elides the $ entirely
// and emits just (command_substitution). Consume $ and let next
// iteration handle the backtick.
advance(P.L)
continue
}
const exp = parseDollarLike(P)
if (exp) parts.push(exp)
continue
}
if (c === '`') {
if (P.inBacktick > 0) break
const bt = parseBacktick(P)
if (bt) parts.push(bt)
continue
}
// Brace expression {1..5} or {a,b,c} β only if looks like one
if (c === '{') {
const be = tryParseBraceExpr(P)
if (be) {
parts.push(be)
continue
}
// SECURITY: if `{` is immediately followed by a command terminator
// (; | & newline or EOF), it's a standalone word β don't slurp the
// rest of the line via tryParseBraceLikeCat. `echo {;touch /tmp/evil`
// must split on `;` so the security walker sees `touch`.
const nc = peek(P.L, 1)
if (
nc === ';' ||
nc === '|' ||
nc === '&' ||
nc === '\n' ||
nc === '' ||
nc === ')' ||
nc === ' ' ||
nc === '\t'
) {
const bStart = P.L.b
advance(P.L)
parts.push(mk(P, 'word', bStart, P.L.b, []))
continue
}
// Otherwise treat { and } as word fragments
const cat = tryParseBraceLikeCat(P)
if (cat) {
for (const p of cat) parts.push(p)
continue
}
}
// Standalone `}` in arg position is a word (e.g., `echo }foo`).
// parseBareWord breaks on `}` so handle it here.
if (c === '}') {
const bStart = P.L.b
advance(P.L)
parts.push(mk(P, 'word', bStart, P.L.b, []))
continue
}
// `[` and `]` are single-char word fragments (tree-sitter splits at
// brackets: `[:lower:]` β `[` `:lower:` `]`, `{o[k]}` β 6 words).
if (c === '[' || c === ']') {
const bStart = P.L.b
advance(P.L)
parts.push(mk(P, 'word', bStart, P.L.b, []))
continue
}
// Bare word fragment
const frag = parseBareWord(P)
if (!frag) break
// `NN#${...}` or `NN#$(...)` β (number (expansion|command_substitution)).
// Grammar: number can be seq(/-?(0x)?[0-9]+#/, choice(expansion, cmd_sub)).
// `10#${cmd}` must NOT be concatenation β it's a single number node with
// the expansion as child. Detect here: frag ends with `#`, next is $ {/(.
if (
frag.type === 'word' &&
/^-?(0x)?[0-9]+#$/.test(frag.text) &&
peek(P.L) === '$' &&
(peek(P.L, 1) === '{' || peek(P.L, 1) === '(')
) {
const exp = parseDollarLike(P)
if (exp) {
// Prefix `NN#` is an anonymous pattern in grammar β only the
// expansion/cmd_sub is a named child.
parts.push(mk(P, 'number', frag.startIndex, exp.endIndex, [exp]))
continue
}
}
parts.push(frag)
}
if (parts.length === 0) return null
if (parts.length === 1) return parts[0]!
// Concatenation
const first = parts[0]!
const last = parts[parts.length - 1]!
return mk(P, 'concatenation', first.startIndex, last.endIndex, parts)
}
function parseBareWord(P: ParseState): TsNode | null {
const start = P.L.b
const startI = P.L.i
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\\') {
if (P.L.i + 1 >= P.L.len) {
// Trailing unpaired `\` at true EOF β tree-sitter emits word WITHOUT
// the `\` plus a sibling ERROR node. Stop here; caller emits ERROR.
break
}
const nx = P.L.src[P.L.i + 1]
if (nx === '\n' || (nx === '\r' && P.L.src[P.L.i + 2] === '\n')) {
// Line continuation BREAKS the word (tree-sitter quirk) β handles \r?\n
break
}
advance(P.L)
advance(P.L)
continue
}
if (
c === ' ' ||
c === '\t' ||
c === '\n' ||
c === '\r' ||
c === '' ||
c === '|' ||
c === '&' ||
c === ';' ||
c === '(' ||
c === ')' ||
c === '<' ||
c === '>' ||
c === '"' ||
c === "'" ||
c === '$' ||
c === '`' ||
c === '{' ||
c === '}' ||
c === '[' ||
c === ']'
) {
break
}
advance(P.L)
}
if (P.L.b === start) return null
const text = P.src.slice(startI, P.L.i)
const type = /^-?\d+$/.test(text) ? 'number' : 'word'
return mk(P, type, start, P.L.b, [])
}
function tryParseBraceExpr(P: ParseState): TsNode | null {
// {N..M} where N, M are numbers or single chars
const save = saveLex(P.L)
if (peek(P.L) !== '{') return null
const oStart = P.L.b
advance(P.L)
const oEnd = P.L.b
// First part
const p1Start = P.L.b
while (isDigit(peek(P.L)) || isIdentStart(peek(P.L))) advance(P.L)
const p1End = P.L.b
if (p1End === p1Start || peek(P.L) !== '.' || peek(P.L, 1) !== '.') {
restoreLex(P.L, save)
return null
}
const dotStart = P.L.b
advance(P.L)
advance(P.L)
const dotEnd = P.L.b
const p2Start = P.L.b
while (isDigit(peek(P.L)) || isIdentStart(peek(P.L))) advance(P.L)
const p2End = P.L.b
if (p2End === p2Start || peek(P.L) !== '}') {
restoreLex(P.L, save)
return null
}
const cStart = P.L.b
advance(P.L)
const cEnd = P.L.b
const p1Text = sliceBytes(P, p1Start, p1End)
const p2Text = sliceBytes(P, p2Start, p2End)
const p1IsNum = /^\d+$/.test(p1Text)
const p2IsNum = /^\d+$/.test(p2Text)
// Valid brace expression: both numbers OR both single chars. Mixed = reject.
if (p1IsNum !== p2IsNum) {
restoreLex(P.L, save)
return null
}
if (!p1IsNum && (p1Text.length !== 1 || p2Text.length !== 1)) {
restoreLex(P.L, save)
return null
}
const p1Type = p1IsNum ? 'number' : 'word'
const p2Type = p2IsNum ? 'number' : 'word'
return mk(P, 'brace_expression', oStart, cEnd, [
mk(P, '{', oStart, oEnd, []),
mk(P, p1Type, p1Start, p1End, []),
mk(P, '..', dotStart, dotEnd, []),
mk(P, p2Type, p2Start, p2End, []),
mk(P, '}', cStart, cEnd, []),
])
}
function tryParseBraceLikeCat(P: ParseState): TsNode[] | null {
// {a,b,c} or {} β split into word fragments like tree-sitter does
if (peek(P.L) !== '{') return null
const oStart = P.L.b
advance(P.L)
const oEnd = P.L.b
const inner: TsNode[] = [mk(P, 'word', oStart, oEnd, [])]
while (P.L.i < P.L.len) {
const bc = peek(P.L)
// SECURITY: stop at command terminators so `{foo;rm x` splits correctly.
if (
bc === '}' ||
bc === '\n' ||
bc === ';' ||
bc === '|' ||
bc === '&' ||
bc === ' ' ||
bc === '\t' ||
bc === '<' ||
bc === '>' ||
bc === '(' ||
bc === ')'
) {
break
}
// `[` and `]` are single-char words: {o[k]} β { o [ k ] }
if (bc === '[' || bc === ']') {
const bStart = P.L.b
advance(P.L)
inner.push(mk(P, 'word', bStart, P.L.b, []))
continue
}
const midStart = P.L.b
while (P.L.i < P.L.len) {
const mc = peek(P.L)
if (
mc === '}' ||
mc === '\n' ||
mc === ';' ||
mc === '|' ||
mc === '&' ||
mc === ' ' ||
mc === '\t' ||
mc === '<' ||
mc === '>' ||
mc === '(' ||
mc === ')' ||
mc === '[' ||
mc === ']'
) {
break
}
advance(P.L)
}
const midEnd = P.L.b
if (midEnd > midStart) {
const midText = sliceBytes(P, midStart, midEnd)
const midType = /^-?\d+$/.test(midText) ? 'number' : 'word'
inner.push(mk(P, midType, midStart, midEnd, []))
} else {
break
}
}
if (peek(P.L) === '}') {
const cStart = P.L.b
advance(P.L)
inner.push(mk(P, 'word', cStart, P.L.b, []))
}
return inner
}
function parseDoubleQuoted(P: ParseState): TsNode {
const qStart = P.L.b
advance(P.L)
const qEnd = P.L.b
const openQ = mk(P, '"', qStart, qEnd, [])
const parts: TsNode[] = [openQ]
let contentStart = P.L.b
let contentStartI = P.L.i
const flushContent = (): void => {
if (P.L.b > contentStart) {
// Tree-sitter's extras rule /\s/ has higher precedence than
// string_content (prec -1), so whitespace-only segments are elided.
// `" ${x} "` β (string (expansion)) not (string (string_content)(expansion)(string_content)).
// Note: this intentionally diverges from preserving all content β cc
// tests relying on whitespace-only string_content need updating
// (CCReconcile).
const txt = P.src.slice(contentStartI, P.L.i)
if (!/^[ \t]+$/.test(txt)) {
parts.push(mk(P, 'string_content', contentStart, P.L.b, []))
}
}
}
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '"') break
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '\n') {
// Split string_content at newline
flushContent()
advance(P.L)
contentStart = P.L.b
contentStartI = P.L.i
continue
}
if (c === '$') {
const c1 = peek(P.L, 1)
if (
c1 === '(' ||
c1 === '{' ||
isIdentStart(c1) ||
SPECIAL_VARS.has(c1) ||
isDigit(c1)
) {
flushContent()
const exp = parseDollarLike(P)
if (exp) parts.push(exp)
contentStart = P.L.b
contentStartI = P.L.i
continue
}
// Bare $ not at end-of-string: tree-sitter emits it as an anonymous
// '$' token, which splits string_content. $ immediately before the
// closing " is absorbed into the preceding string_content.
if (c1 !== '"' && c1 !== '') {
flushContent()
const dS = P.L.b
advance(P.L)
parts.push(mk(P, '$', dS, P.L.b, []))
contentStart = P.L.b
contentStartI = P.L.i
continue
}
}
if (c === '`') {
flushContent()
const bt = parseBacktick(P)
if (bt) parts.push(bt)
contentStart = P.L.b
contentStartI = P.L.i
continue
}
advance(P.L)
}
flushContent()
let close: TsNode
if (peek(P.L) === '"') {
const cStart = P.L.b
advance(P.L)
close = mk(P, '"', cStart, P.L.b, [])
} else {
close = mk(P, '"', P.L.b, P.L.b, [])
}
parts.push(close)
return mk(P, 'string', qStart, close.endIndex, parts)
}
function parseDollarLike(P: ParseState): TsNode | null {
const c1 = peek(P.L, 1)
const dStart = P.L.b
if (c1 === '(' && peek(P.L, 2) === '(') {
// $(( arithmetic ))
advance(P.L)
advance(P.L)
advance(P.L)
const open = mk(P, '$((', dStart, P.L.b, [])
const exprs = parseArithCommaList(P, '))', 'var')
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')' && peek(P.L, 1) === ')') {
const cStart = P.L.b
advance(P.L)
advance(P.L)
close = mk(P, '))', cStart, P.L.b, [])
} else {
close = mk(P, '))', P.L.b, P.L.b, [])
}
return mk(P, 'arithmetic_expansion', dStart, close.endIndex, [
open,
...exprs,
close,
])
}
if (c1 === '[') {
// $[ arithmetic ] β legacy bash syntax, same as $((...))
advance(P.L)
advance(P.L)
const open = mk(P, '$[', dStart, P.L.b, [])
const exprs = parseArithCommaList(P, ']', 'var')
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ']') {
const cStart = P.L.b
advance(P.L)
close = mk(P, ']', cStart, P.L.b, [])
} else {
close = mk(P, ']', P.L.b, P.L.b, [])
}
return mk(P, 'arithmetic_expansion', dStart, close.endIndex, [
open,
...exprs,
close,
])
}
if (c1 === '(') {
advance(P.L)
advance(P.L)
const open = mk(P, '$(', dStart, P.L.b, [])
let body = parseStatements(P, ')')
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')') {
const cStart = P.L.b
advance(P.L)
close = mk(P, ')', cStart, P.L.b, [])
} else {
close = mk(P, ')', P.L.b, P.L.b, [])
}
// $(< file) shorthand: unwrap redirected_statement β bare file_redirect
// tree-sitter emits (command_substitution (file_redirect (word))) directly
if (
body.length === 1 &&
body[0]!.type === 'redirected_statement' &&
body[0]!.children.length === 1 &&
body[0]!.children[0]!.type === 'file_redirect'
) {
body = body[0]!.children
}
return mk(P, 'command_substitution', dStart, close.endIndex, [
open,
...body,
close,
])
}
if (c1 === '{') {
advance(P.L)
advance(P.L)
const open = mk(P, '${', dStart, P.L.b, [])
const inner = parseExpansionBody(P)
let close: TsNode
if (peek(P.L) === '}') {
const cStart = P.L.b
advance(P.L)
close = mk(P, '}', cStart, P.L.b, [])
} else {
close = mk(P, '}', P.L.b, P.L.b, [])
}
return mk(P, 'expansion', dStart, close.endIndex, [open, ...inner, close])
}
// Simple expansion $VAR or $? $$ $@ etc
advance(P.L)
const dEnd = P.L.b
const dollar = mk(P, '$', dStart, dEnd, [])
const nc = peek(P.L)
// $_ is special_variable_name only when not followed by more ident chars
if (nc === '_' && !isIdentChar(peek(P.L, 1))) {
const vStart = P.L.b
advance(P.L)
const vn = mk(P, 'special_variable_name', vStart, P.L.b, [])
return mk(P, 'simple_expansion', dStart, P.L.b, [dollar, vn])
}
if (isIdentStart(nc)) {
const vStart = P.L.b
while (isIdentChar(peek(P.L))) advance(P.L)
const vn = mk(P, 'variable_name', vStart, P.L.b, [])
return mk(P, 'simple_expansion', dStart, P.L.b, [dollar, vn])
}
if (isDigit(nc)) {
const vStart = P.L.b
advance(P.L)
const vn = mk(P, 'variable_name', vStart, P.L.b, [])
return mk(P, 'simple_expansion', dStart, P.L.b, [dollar, vn])
}
if (SPECIAL_VARS.has(nc)) {
const vStart = P.L.b
advance(P.L)
const vn = mk(P, 'special_variable_name', vStart, P.L.b, [])
return mk(P, 'simple_expansion', dStart, P.L.b, [dollar, vn])
}
// Bare $ β just a $ leaf (tree-sitter treats trailing $ as literal)
return dollar
}
function parseExpansionBody(P: ParseState): TsNode[] {
const out: TsNode[] = []
skipBlanks(P.L)
// Bizarre cases: ${#!} ${!#} ${!##} ${!# } ${!## } all emit empty (expansion)
// β both # and ! become anonymous nodes when only combined with each other
// and optional trailing space before }. Note ${!##/} does NOT match (has
// content after), so it parses normally as (special_variable_name)(regex).
{
const c0 = peek(P.L)
const c1 = peek(P.L, 1)
if (c0 === '#' && c1 === '!' && peek(P.L, 2) === '}') {
advance(P.L)
advance(P.L)
return out
}
if (c0 === '!' && c1 === '#') {
// ${!#} ${!##} with optional trailing space then }
let j = 2
if (peek(P.L, j) === '#') j++
if (peek(P.L, j) === ' ') j++
if (peek(P.L, j) === '}') {
while (j-- > 0) advance(P.L)
return out
}
}
}
// Optional # prefix for length
if (peek(P.L) === '#') {
const s = P.L.b
advance(P.L)
out.push(mk(P, '#', s, P.L.b, []))
}
// Optional ! prefix for indirect expansion: ${!varname} ${!prefix*} ${!prefix@}
// Only when followed by an identifier β ${!} alone is special var $!
// Also = ~ prefixes (zsh-style ${=var} ${~var})
const pc = peek(P.L)
if (
(pc === '!' || pc === '=' || pc === '~') &&
(isIdentStart(peek(P.L, 1)) || isDigit(peek(P.L, 1)))
) {
const s = P.L.b
advance(P.L)
out.push(mk(P, pc, s, P.L.b, []))
}
skipBlanks(P.L)
// Variable name
if (isIdentStart(peek(P.L))) {
const s = P.L.b
while (isIdentChar(peek(P.L))) advance(P.L)
out.push(mk(P, 'variable_name', s, P.L.b, []))
} else if (isDigit(peek(P.L))) {
const s = P.L.b
while (isDigit(peek(P.L))) advance(P.L)
out.push(mk(P, 'variable_name', s, P.L.b, []))
} else if (SPECIAL_VARS.has(peek(P.L))) {
const s = P.L.b
advance(P.L)
out.push(mk(P, 'special_variable_name', s, P.L.b, []))
}
// Optional subscript [idx] β parsed arithmetically
if (peek(P.L) === '[') {
const varNode = out[out.length - 1]
const brOpen = P.L.b
advance(P.L)
const brOpenNode = mk(P, '[', brOpen, P.L.b, [])
const idx = parseSubscriptIndexInline(P)
skipBlanks(P.L)
const brClose = P.L.b
if (peek(P.L) === ']') advance(P.L)
const brCloseNode = mk(P, ']', brClose, P.L.b, [])
if (varNode) {
const kids = idx
? [varNode, brOpenNode, idx, brCloseNode]
: [varNode, brOpenNode, brCloseNode]
out[out.length - 1] = mk(P, 'subscript', varNode.startIndex, P.L.b, kids)
}
}
skipBlanks(P.L)
// Trailing * or @ for indirect expansion (${!prefix*} ${!prefix@}) or
// @operator for parameter transformation (${var@U} ${var@Q}) β anonymous
const tc = peek(P.L)
if ((tc === '*' || tc === '@') && peek(P.L, 1) === '}') {
const s = P.L.b
advance(P.L)
out.push(mk(P, tc, s, P.L.b, []))
return out
}
if (tc === '@' && isIdentStart(peek(P.L, 1))) {
// ${var@U} transformation β @ is anonymous, consume op char(s)
const s = P.L.b
advance(P.L)
out.push(mk(P, '@', s, P.L.b, []))
while (isIdentChar(peek(P.L))) advance(P.L)
return out
}
// Operator :- := :? :+ - = ? + # ## % %% / // ^ ^^ , ,, etc.
const c = peek(P.L)
// Bare `:` substring operator ${var:off:len} β offset and length parsed
// arithmetically. Must come BEFORE the generic operator handling so `(` after
// `:` goes to parenthesized_expression not the array path. `:-` `:=` `:?`
// `:+` (no space) remain default-value operators; `: -1` (with space before
// -1) is substring with negative offset.
if (c === ':') {
const c1 = peek(P.L, 1)
// `:\n` or `:}` β empty substring expansion, emits nothing (variable_name only)
if (c1 === '\n' || c1 === '}') {
advance(P.L)
while (peek(P.L) === '\n') advance(P.L)
return out
}
if (c1 !== '-' && c1 !== '=' && c1 !== '?' && c1 !== '+') {
advance(P.L)
skipBlanks(P.L)
// Offset β arithmetic. `-N` at top level is a single number node per
// tree-sitter; inside parens it's unary_expression(number).
const offC = peek(P.L)
let off: TsNode | null
if (offC === '-' && isDigit(peek(P.L, 1))) {
const ns = P.L.b
advance(P.L)
while (isDigit(peek(P.L))) advance(P.L)
off = mk(P, 'number', ns, P.L.b, [])
} else {
off = parseArithExpr(P, ':}', 'var')
}
if (off) out.push(off)
skipBlanks(P.L)
if (peek(P.L) === ':') {
advance(P.L)
skipBlanks(P.L)
const lenC = peek(P.L)
let len: TsNode | null
if (lenC === '-' && isDigit(peek(P.L, 1))) {
const ns = P.L.b
advance(P.L)
while (isDigit(peek(P.L))) advance(P.L)
len = mk(P, 'number', ns, P.L.b, [])
} else {
len = parseArithExpr(P, '}', 'var')
}
if (len) out.push(len)
}
return out
}
}
if (
c === ':' ||
c === '#' ||
c === '%' ||
c === '/' ||
c === '^' ||
c === ',' ||
c === '-' ||
c === '=' ||
c === '?' ||
c === '+'
) {
const s = P.L.b
const c1 = peek(P.L, 1)
let op = c
if (c === ':' && (c1 === '-' || c1 === '=' || c1 === '?' || c1 === '+')) {
advance(P.L)
advance(P.L)
op = c + c1
} else if (
(c === '#' || c === '%' || c === '/' || c === '^' || c === ',') &&
c1 === c
) {
// Doubled operators: ## %% // ^^ ,,
advance(P.L)
advance(P.L)
op = c + c
} else {
advance(P.L)
}
out.push(mk(P, op, s, P.L.b, []))
// Rest is the default/replacement β parse as word or regex until }
// Pattern-matching operators (# ## % %% / // ^ ^^ , ,,) emit regex;
// value-substitution operators (:- := :? :+ - = ? + :) emit word.
// `/` and `//` split at next `/` into (regex)+(word) for pat/repl.
const isPattern =
op === '#' ||
op === '##' ||
op === '%' ||
op === '%%' ||
op === '/' ||
op === '//' ||
op === '^' ||
op === '^^' ||
op === ',' ||
op === ',,'
if (op === '/' || op === '//') {
// Optional /# or /% anchor prefix β anonymous node
const ac = peek(P.L)
if (ac === '#' || ac === '%') {
const aStart = P.L.b
advance(P.L)
out.push(mk(P, ac, aStart, P.L.b, []))
}
// Pattern: per grammar _expansion_regex_replacement, pattern is
// choice(regex, string, cmd_sub, seq(string, regex)). If it STARTS
// with ", emit (string) and any trailing chars become (regex).
// `${v//"${old}"/}` β (string(expansion)); `${v//"${c}"\//}` β
// (string)(regex).
if (peek(P.L) === '"') {
out.push(parseDoubleQuoted(P))
const tail = parseExpansionRest(P, 'regex', true)
if (tail) out.push(tail)
} else {
const regex = parseExpansionRest(P, 'regex', true)
if (regex) out.push(regex)
}
if (peek(P.L) === '/') {
const sepStart = P.L.b
advance(P.L)
out.push(mk(P, '/', sepStart, P.L.b, []))
// Replacement: per grammar, choice includes `seq(cmd_sub, word)`
// which emits TWO siblings (not concatenation). Also `(` at start
// of replacement is a regular word char, NOT array β unlike `:-`
// default-value context. `${v/(/(Gentoo ${x}, }` replacement
// `(Gentoo ${x}, ` is (concatenation (word)(expansion)(word)).
const repl = parseExpansionRest(P, 'replword', false)
if (repl) {
// seq(cmd_sub, word) special case β siblings. Detected when
// replacement is a concatenation of exactly 2 parts with first
// being command_substitution.
if (
repl.type === 'concatenation' &&
repl.children.length === 2 &&
repl.children[0]!.type === 'command_substitution'
) {
out.push(repl.children[0]!)
out.push(repl.children[1]!)
} else {
out.push(repl)
}
}
}
} else if (op === '#' || op === '##' || op === '%' || op === '%%') {
// Pattern-removal: per grammar _expansion_regex, pattern is
// repeat(choice(regex, string, raw_string, ')')). Each quote/string
// is a SIBLING, not absorbed into one regex. `${f%'str'*}` β
// (raw_string)(regex); `${f/'str'*}` (slash) stays single regex.
for (const p of parseExpansionRegexSegmented(P)) out.push(p)
} else {
const rest = parseExpansionRest(P, isPattern ? 'regex' : 'word', false)
if (rest) out.push(rest)
}
}
return out
}
function parseExpansionRest(
P: ParseState,
nodeType: string,
stopAtSlash: boolean,
): TsNode | null {
// Don't skipBlanks β `${var:- }` space IS the word. Stop at } or newline
// (`${var:\n}` emits no word). stopAtSlash=true stops at `/` for pat/repl
// split in ${var/pat/repl}. nodeType 'replword' is word-mode for the
// replacement in `/` `//` β same as 'word' but `(` is NOT array.
const start = P.L.b
// Value-substitution RHS starting with `(` parses as array: ${var:-(x)} β
// (expansion (variable_name) (array (word))). Only for 'word' context (not
// pattern-matching operators which emit regex, and not 'replword' where `(`
// is a regular char per grammar `_expansion_regex_replacement`).
if (nodeType === 'word' && peek(P.L) === '(') {
advance(P.L)
const open = mk(P, '(', start, P.L.b, [])
const elems: TsNode[] = [open]
while (P.L.i < P.L.len) {
skipBlanks(P.L)
const c = peek(P.L)
if (c === ')' || c === '}' || c === '\n' || c === '') break
const wStart = P.L.b
while (P.L.i < P.L.len) {
const wc = peek(P.L)
if (
wc === ')' ||
wc === '}' ||
wc === ' ' ||
wc === '\t' ||
wc === '\n' ||
wc === ''
) {
break
}
advance(P.L)
}
if (P.L.b > wStart) elems.push(mk(P, 'word', wStart, P.L.b, []))
else break
}
if (peek(P.L) === ')') {
const cStart = P.L.b
advance(P.L)
elems.push(mk(P, ')', cStart, P.L.b, []))
}
while (peek(P.L) === '\n') advance(P.L)
return mk(P, 'array', start, P.L.b, elems)
}
// REGEX mode: flat single-span scan. Quotes are opaque (skipped past so
// `/` inside them doesn't break stopAtSlash), but NOT emitted as separate
// nodes β the entire range becomes one regex node.
if (nodeType === 'regex') {
let braceDepth = 0
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\n') break
if (braceDepth === 0) {
if (c === '}') break
if (stopAtSlash && c === '/') break
}
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '"' || c === "'") {
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== c) {
if (peek(P.L) === '\\' && P.L.i + 1 < P.L.len) advance(P.L)
advance(P.L)
}
if (peek(P.L) === c) advance(P.L)
continue
}
// Skip past nested ${...} $(...) $[...] so their } / don't terminate us
if (c === '$') {
const c1 = peek(P.L, 1)
if (c1 === '{') {
let d = 0
advance(P.L)
advance(P.L)
d++
while (P.L.i < P.L.len && d > 0) {
const nc = peek(P.L)
if (nc === '{') d++
else if (nc === '}') d--
advance(P.L)
}
continue
}
if (c1 === '(') {
let d = 0
advance(P.L)
advance(P.L)
d++
while (P.L.i < P.L.len && d > 0) {
const nc = peek(P.L)
if (nc === '(') d++
else if (nc === ')') d--
advance(P.L)
}
continue
}
}
if (c === '{') braceDepth++
else if (c === '}' && braceDepth > 0) braceDepth--
advance(P.L)
}
const end = P.L.b
while (peek(P.L) === '\n') advance(P.L)
if (end === start) return null
return mk(P, 'regex', start, end, [])
}
// WORD mode: segmenting parser β recognize nested ${...}, $(...), $'...',
// "...", '...', $ident, <(...)/>(...); bare chars accumulate into word
// segments. Multiple parts β wrapped in concatenation.
const parts: TsNode[] = []
let segStart = P.L.b
let braceDepth = 0
const flushSeg = (): void => {
if (P.L.b > segStart) {
parts.push(mk(P, 'word', segStart, P.L.b, []))
}
}
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\n') break
if (braceDepth === 0) {
if (c === '}') break
if (stopAtSlash && c === '/') break
}
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
const c1 = peek(P.L, 1)
if (c === '$') {
if (c1 === '{' || c1 === '(' || c1 === '[') {
flushSeg()
const exp = parseDollarLike(P)
if (exp) parts.push(exp)
segStart = P.L.b
continue
}
if (c1 === "'") {
// $'...' ANSI-C string
flushSeg()
const aStart = P.L.b
advance(P.L)
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== "'") {
if (peek(P.L) === '\\' && P.L.i + 1 < P.L.len) advance(P.L)
advance(P.L)
}
if (peek(P.L) === "'") advance(P.L)
parts.push(mk(P, 'ansi_c_string', aStart, P.L.b, []))
segStart = P.L.b
continue
}
if (isIdentStart(c1) || isDigit(c1) || SPECIAL_VARS.has(c1)) {
flushSeg()
const exp = parseDollarLike(P)
if (exp) parts.push(exp)
segStart = P.L.b
continue
}
}
if (c === '"') {
flushSeg()
parts.push(parseDoubleQuoted(P))
segStart = P.L.b
continue
}
if (c === "'") {
flushSeg()
const rStart = P.L.b
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== "'") advance(P.L)
if (peek(P.L) === "'") advance(P.L)
parts.push(mk(P, 'raw_string', rStart, P.L.b, []))
segStart = P.L.b
continue
}
if ((c === '<' || c === '>') && c1 === '(') {
flushSeg()
const ps = parseProcessSub(P)
if (ps) parts.push(ps)
segStart = P.L.b
continue
}
if (c === '`') {
flushSeg()
const bt = parseBacktick(P)
if (bt) parts.push(bt)
segStart = P.L.b
continue
}
// Brace tracking so nested {a,b} brace-expansion chars don't prematurely
// terminate (rare, but the `?` in `${cond}? (` should be treated as word).
if (c === '{') braceDepth++
else if (c === '}' && braceDepth > 0) braceDepth--
advance(P.L)
}
flushSeg()
// Consume trailing newlines before } so caller sees }
while (peek(P.L) === '\n') advance(P.L)
// Tree-sitter skips leading whitespace (extras) in expansion RHS when
// there's content after: `${2+ ${2}}` β just (expansion). But `${v:- }`
// (space-only RHS) keeps the space as (word). So drop leading whitespace-
// only word segment if it's NOT the only part.
if (
parts.length > 1 &&
parts[0]!.type === 'word' &&
/^[ \t]+$/.test(parts[0]!.text)
) {
parts.shift()
}
if (parts.length === 0) return null
if (parts.length === 1) return parts[0]!
// Multiple parts: wrap in concatenation (word mode keeps concat wrapping;
// regex mode also concats per tree-sitter for mixed quote+glob patterns).
const last = parts[parts.length - 1]!
return mk(P, 'concatenation', parts[0]!.startIndex, last.endIndex, parts)
}
// Pattern for # ## % %% operators β per grammar _expansion_regex:
// repeat(choice(regex, string, raw_string, ')', /\s+/βregex)). Each quote
// becomes a SIBLING node, not absorbed. `${f%'str'*}` β (raw_string)(regex).
function parseExpansionRegexSegmented(P: ParseState): TsNode[] {
const out: TsNode[] = []
let segStart = P.L.b
const flushRegex = (): void => {
if (P.L.b > segStart) out.push(mk(P, 'regex', segStart, P.L.b, []))
}
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '}' || c === '\n') break
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '"') {
flushRegex()
out.push(parseDoubleQuoted(P))
segStart = P.L.b
continue
}
if (c === "'") {
flushRegex()
const rStart = P.L.b
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== "'") advance(P.L)
if (peek(P.L) === "'") advance(P.L)
out.push(mk(P, 'raw_string', rStart, P.L.b, []))
segStart = P.L.b
continue
}
// Nested ${...} $(...) β opaque scan so their } doesn't terminate us
if (c === '$') {
const c1 = peek(P.L, 1)
if (c1 === '{') {
let d = 1
advance(P.L)
advance(P.L)
while (P.L.i < P.L.len && d > 0) {
const nc = peek(P.L)
if (nc === '{') d++
else if (nc === '}') d--
advance(P.L)
}
continue
}
if (c1 === '(') {
let d = 1
advance(P.L)
advance(P.L)
while (P.L.i < P.L.len && d > 0) {
const nc = peek(P.L)
if (nc === '(') d++
else if (nc === ')') d--
advance(P.L)
}
continue
}
}
advance(P.L)
}
flushRegex()
while (peek(P.L) === '\n') advance(P.L)
return out
}
function parseBacktick(P: ParseState): TsNode | null {
const start = P.L.b
advance(P.L)
const open = mk(P, '`', start, P.L.b, [])
P.inBacktick++
// Parse statements inline β stop at closing backtick
const body: TsNode[] = []
while (true) {
skipBlanks(P.L)
if (peek(P.L) === '`' || peek(P.L) === '') break
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'EOF' || t.type === 'BACKTICK') {
restoreLex(P.L, save)
break
}
if (t.type === 'NEWLINE') continue
restoreLex(P.L, save)
const stmt = parseAndOr(P)
if (!stmt) break
body.push(stmt)
skipBlanks(P.L)
if (peek(P.L) === '`') break
const save2 = saveLex(P.L)
const sep = nextToken(P.L, 'cmd')
if (sep.type === 'OP' && (sep.value === ';' || sep.value === '&')) {
body.push(leaf(P, sep.value, sep))
} else if (sep.type !== 'NEWLINE') {
restoreLex(P.L, save2)
}
}
P.inBacktick--
let close: TsNode
if (peek(P.L) === '`') {
const cStart = P.L.b
advance(P.L)
close = mk(P, '`', cStart, P.L.b, [])
} else {
close = mk(P, '`', P.L.b, P.L.b, [])
}
// Empty backticks (whitespace/newline only) are elided entirely by
// tree-sitter β used as a line-continuation hack: "foo"`<newline>`"bar"
// β (concatenation (string) (string)) with no command_substitution.
if (body.length === 0) return null
return mk(P, 'command_substitution', start, close.endIndex, [
open,
...body,
close,
])
}
function parseIf(P: ParseState, ifTok: Token): TsNode {
const ifKw = leaf(P, 'if', ifTok)
const kids: TsNode[] = [ifKw]
const cond = parseStatements(P, null)
kids.push(...cond)
consumeKeyword(P, 'then', kids)
const body = parseStatements(P, null)
kids.push(...body)
while (true) {
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'WORD' && t.value === 'elif') {
const eKw = leaf(P, 'elif', t)
const eCond = parseStatements(P, null)
const eKids: TsNode[] = [eKw, ...eCond]
consumeKeyword(P, 'then', eKids)
const eBody = parseStatements(P, null)
eKids.push(...eBody)
const last = eKids[eKids.length - 1]!
kids.push(mk(P, 'elif_clause', eKw.startIndex, last.endIndex, eKids))
} else if (t.type === 'WORD' && t.value === 'else') {
const elKw = leaf(P, 'else', t)
const elBody = parseStatements(P, null)
const last = elBody.length > 0 ? elBody[elBody.length - 1]! : elKw
kids.push(
mk(P, 'else_clause', elKw.startIndex, last.endIndex, [elKw, ...elBody]),
)
} else {
restoreLex(P.L, save)
break
}
}
consumeKeyword(P, 'fi', kids)
const last = kids[kids.length - 1]!
return mk(P, 'if_statement', ifKw.startIndex, last.endIndex, kids)
}
function parseWhile(P: ParseState, kwTok: Token): TsNode {
const kw = leaf(P, kwTok.value, kwTok)
const kids: TsNode[] = [kw]
const cond = parseStatements(P, null)
kids.push(...cond)
const dg = parseDoGroup(P)
if (dg) kids.push(dg)
const last = kids[kids.length - 1]!
return mk(P, 'while_statement', kw.startIndex, last.endIndex, kids)
}
function parseFor(P: ParseState, forTok: Token): TsNode {
const forKw = leaf(P, forTok.value, forTok)
skipBlanks(P.L)
// C-style for (( ; ; )) β only for `for`, not `select`
if (forTok.value === 'for' && peek(P.L) === '(' && peek(P.L, 1) === '(') {
const oStart = P.L.b
advance(P.L)
advance(P.L)
const open = mk(P, '((', oStart, P.L.b, [])
const kids: TsNode[] = [forKw, open]
// init; cond; update β all three use 'assign' mode so `c = expr` emits
// variable_assignment, while bare idents (c in `c<=5`) β word. Each
// clause may be a comma-separated list.
for (let k = 0; k < 3; k++) {
skipBlanks(P.L)
const es = parseArithCommaList(P, k < 2 ? ';' : '))', 'assign')
kids.push(...es)
if (k < 2) {
if (peek(P.L) === ';') {
const s = P.L.b
advance(P.L)
kids.push(mk(P, ';', s, P.L.b, []))
}
}
}
skipBlanks(P.L)
if (peek(P.L) === ')' && peek(P.L, 1) === ')') {
const cStart = P.L.b
advance(P.L)
advance(P.L)
kids.push(mk(P, '))', cStart, P.L.b, []))
}
// Optional ; or newline
const save = saveLex(P.L)
const sep = nextToken(P.L, 'cmd')
if (sep.type === 'OP' && sep.value === ';') {
kids.push(leaf(P, ';', sep))
} else if (sep.type !== 'NEWLINE') {
restoreLex(P.L, save)
}
const dg = parseDoGroup(P)
if (dg) {
kids.push(dg)
} else {
// C-style for can also use `{ ... }` body instead of `do ... done`
skipNewlines(P)
skipBlanks(P.L)
if (peek(P.L) === '{') {
const bOpen = P.L.b
advance(P.L)
const brace = mk(P, '{', bOpen, P.L.b, [])
const body = parseStatements(P, '}')
let bClose: TsNode
if (peek(P.L) === '}') {
const cs = P.L.b
advance(P.L)
bClose = mk(P, '}', cs, P.L.b, [])
} else {
bClose = mk(P, '}', P.L.b, P.L.b, [])
}
kids.push(
mk(P, 'compound_statement', brace.startIndex, bClose.endIndex, [
brace,
...body,
bClose,
]),
)
}
}
const last = kids[kids.length - 1]!
return mk(P, 'c_style_for_statement', forKw.startIndex, last.endIndex, kids)
}
// Regular for VAR in words; do ... done
const kids: TsNode[] = [forKw]
const varTok = nextToken(P.L, 'arg')
kids.push(mk(P, 'variable_name', varTok.start, varTok.end, []))
skipBlanks(P.L)
const save = saveLex(P.L)
const inTok = nextToken(P.L, 'arg')
if (inTok.type === 'WORD' && inTok.value === 'in') {
kids.push(leaf(P, 'in', inTok))
while (true) {
skipBlanks(P.L)
const c = peek(P.L)
if (c === ';' || c === '\n' || c === '') break
const w = parseWord(P, 'arg')
if (!w) break
kids.push(w)
}
} else {
restoreLex(P.L, save)
}
// Separator
const save2 = saveLex(P.L)
const sep = nextToken(P.L, 'cmd')
if (sep.type === 'OP' && sep.value === ';') {
kids.push(leaf(P, ';', sep))
} else if (sep.type !== 'NEWLINE') {
restoreLex(P.L, save2)
}
const dg = parseDoGroup(P)
if (dg) kids.push(dg)
const last = kids[kids.length - 1]!
return mk(P, 'for_statement', forKw.startIndex, last.endIndex, kids)
}
function parseDoGroup(P: ParseState): TsNode | null {
skipNewlines(P)
const save = saveLex(P.L)
const doTok = nextToken(P.L, 'cmd')
if (doTok.type !== 'WORD' || doTok.value !== 'do') {
restoreLex(P.L, save)
return null
}
const doKw = leaf(P, 'do', doTok)
const body = parseStatements(P, null)
const kids: TsNode[] = [doKw, ...body]
consumeKeyword(P, 'done', kids)
const last = kids[kids.length - 1]!
return mk(P, 'do_group', doKw.startIndex, last.endIndex, kids)
}
function parseCase(P: ParseState, caseTok: Token): TsNode {
const caseKw = leaf(P, 'case', caseTok)
const kids: TsNode[] = [caseKw]
skipBlanks(P.L)
const word = parseWord(P, 'arg')
if (word) kids.push(word)
skipBlanks(P.L)
consumeKeyword(P, 'in', kids)
skipNewlines(P)
while (true) {
skipBlanks(P.L)
skipNewlines(P)
const save = saveLex(P.L)
const t = nextToken(P.L, 'arg')
if (t.type === 'WORD' && t.value === 'esac') {
kids.push(leaf(P, 'esac', t))
break
}
if (t.type === 'EOF') break
restoreLex(P.L, save)
const item = parseCaseItem(P)
if (!item) break
kids.push(item)
}
const last = kids[kids.length - 1]!
return mk(P, 'case_statement', caseKw.startIndex, last.endIndex, kids)
}
function parseCaseItem(P: ParseState): TsNode | null {
skipBlanks(P.L)
const start = P.L.b
const kids: TsNode[] = []
// Optional leading '(' before pattern β bash allows (pattern) syntax
if (peek(P.L) === '(') {
const s = P.L.b
advance(P.L)
kids.push(mk(P, '(', s, P.L.b, []))
}
// Pattern(s)
let isFirstAlt = true
while (true) {
skipBlanks(P.L)
const c = peek(P.L)
if (c === ')' || c === '') break
const pats = parseCasePattern(P)
if (pats.length === 0) break
// tree-sitter quirk: first alternative with quotes is inlined as flat
// siblings; subsequent alternatives are wrapped in (concatenation) with
// `word` instead of `extglob_pattern` for bare segments.
if (!isFirstAlt && pats.length > 1) {
const rewritten = pats.map(p =>
p.type === 'extglob_pattern'
? mk(P, 'word', p.startIndex, p.endIndex, [])
: p,
)
const first = rewritten[0]!
const last = rewritten[rewritten.length - 1]!
kids.push(
mk(P, 'concatenation', first.startIndex, last.endIndex, rewritten),
)
} else {
kids.push(...pats)
}
isFirstAlt = false
skipBlanks(P.L)
// \<newline> line continuation between alternatives
if (peek(P.L) === '\\' && peek(P.L, 1) === '\n') {
advance(P.L)
advance(P.L)
skipBlanks(P.L)
}
if (peek(P.L) === '|') {
const s = P.L.b
advance(P.L)
kids.push(mk(P, '|', s, P.L.b, []))
// \<newline> after | is also a line continuation
if (peek(P.L) === '\\' && peek(P.L, 1) === '\n') {
advance(P.L)
advance(P.L)
}
} else {
break
}
}
if (peek(P.L) === ')') {
const s = P.L.b
advance(P.L)
kids.push(mk(P, ')', s, P.L.b, []))
}
const body = parseStatements(P, null)
kids.push(...body)
const save = saveLex(P.L)
const term = nextToken(P.L, 'cmd')
if (
term.type === 'OP' &&
(term.value === ';;' || term.value === ';&' || term.value === ';;&')
) {
kids.push(leaf(P, term.value, term))
} else {
restoreLex(P.L, save)
}
if (kids.length === 0) return null
// tree-sitter quirk: case_item with EMPTY body and a single pattern matching
// extglob-operator-char-prefix (no actual glob metachars) downgrades to word.
// `-o) owner=$2 ;;` (has body) β extglob_pattern; `-g) ;;` (empty) β word.
if (body.length === 0) {
for (let i = 0; i < kids.length; i++) {
const k = kids[i]!
if (k.type !== 'extglob_pattern') continue
const text = sliceBytes(P, k.startIndex, k.endIndex)
if (/^[-+?*@!][a-zA-Z]/.test(text) && !/[*?(]/.test(text)) {
kids[i] = mk(P, 'word', k.startIndex, k.endIndex, [])
}
}
}
const last = kids[kids.length - 1]!
return mk(P, 'case_item', start, last.endIndex, kids)
}
function parseCasePattern(P: ParseState): TsNode[] {
skipBlanks(P.L)
const save = saveLex(P.L)
const start = P.L.b
const startI = P.L.i
let parenDepth = 0
let hasDollar = false
let hasBracketOutsideParen = false
let hasQuote = false
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\\' && P.L.i + 1 < P.L.len) {
// Escaped char β consume both (handles `bar\ baz` as single pattern)
// \<newline> is a line continuation; eat it but stay in pattern.
advance(P.L)
advance(P.L)
continue
}
if (c === '"' || c === "'") {
hasQuote = true
// Skip past the quoted segment so its content (spaces, |, etc.) doesn't
// break the peek-ahead scan.
advance(P.L)
while (P.L.i < P.L.len && peek(P.L) !== c) {
if (peek(P.L) === '\\' && P.L.i + 1 < P.L.len) advance(P.L)
advance(P.L)
}
if (peek(P.L) === c) advance(P.L)
continue
}
// Paren counting: any ( inside pattern opens a scope; don't break at ) or |
// until balanced. Handles extglob *(a|b) and nested shapes *([0-9])([0-9]).
if (c === '(') {
parenDepth++
advance(P.L)
continue
}
if (parenDepth > 0) {
if (c === ')') {
parenDepth--
advance(P.L)
continue
}
if (c === '\n') break
advance(P.L)
continue
}
if (c === ')' || c === '|' || c === ' ' || c === '\t' || c === '\n') break
if (c === '$') hasDollar = true
if (c === '[') hasBracketOutsideParen = true
advance(P.L)
}
if (P.L.b === start) return []
const text = P.src.slice(startI, P.L.i)
const hasExtglobParen = /[*?+@!]\(/.test(text)
// Quoted segments in pattern: tree-sitter splits at quote boundaries into
// multiple sibling nodes. `*"foo"*` β (extglob_pattern)(string)(extglob_pattern).
// Re-scan with a segmenting pass.
if (hasQuote && !hasExtglobParen) {
restoreLex(P.L, save)
return parseCasePatternSegmented(P)
}
// tree-sitter splits patterns with [ or $ into concatenation via word parsing
// UNLESS pattern has extglob parens (those override and emit extglob_pattern).
// `*.[1357]` β concat(word word number word); `${PN}.pot` β concat(expansion word);
// but `*([0-9])` β extglob_pattern (has extglob paren).
if (!hasExtglobParen && (hasDollar || hasBracketOutsideParen)) {
restoreLex(P.L, save)
const w = parseWord(P, 'arg')
return w ? [w] : []
}
// Patterns starting with extglob operator chars (+ - ? * @ !) followed by
// identifier chars are extglob_pattern per tree-sitter, even without parens
// or glob metachars. `-o)` β extglob_pattern; plain `foo)` β word.
const type =
hasExtglobParen || /[*?]/.test(text) || /^[-+?*@!][a-zA-Z]/.test(text)
? 'extglob_pattern'
: 'word'
return [mk(P, type, start, P.L.b, [])]
}
// Segmented scan for case patterns containing quotes: `*"foo"*` β
// [extglob_pattern, string, extglob_pattern]. Bare segments β extglob_pattern
// if they have */?, else word. Stops at ) | space tab newline outside quotes.
function parseCasePatternSegmented(P: ParseState): TsNode[] {
const parts: TsNode[] = []
let segStart = P.L.b
let segStartI = P.L.i
const flushSeg = (): void => {
if (P.L.i > segStartI) {
const t = P.src.slice(segStartI, P.L.i)
const type = /[*?]/.test(t) ? 'extglob_pattern' : 'word'
parts.push(mk(P, type, segStart, P.L.b, []))
}
}
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '"') {
flushSeg()
parts.push(parseDoubleQuoted(P))
segStart = P.L.b
segStartI = P.L.i
continue
}
if (c === "'") {
flushSeg()
const tok = nextToken(P.L, 'arg')
parts.push(leaf(P, 'raw_string', tok))
segStart = P.L.b
segStartI = P.L.i
continue
}
if (c === ')' || c === '|' || c === ' ' || c === '\t' || c === '\n') break
advance(P.L)
}
flushSeg()
return parts
}
function parseFunction(P: ParseState, fnTok: Token): TsNode {
const fnKw = leaf(P, 'function', fnTok)
skipBlanks(P.L)
const nameTok = nextToken(P.L, 'arg')
const name = mk(P, 'word', nameTok.start, nameTok.end, [])
const kids: TsNode[] = [fnKw, name]
skipBlanks(P.L)
if (peek(P.L) === '(' && peek(P.L, 1) === ')') {
const o = nextToken(P.L, 'cmd')
const c = nextToken(P.L, 'cmd')
kids.push(leaf(P, '(', o))
kids.push(leaf(P, ')', c))
}
skipBlanks(P.L)
skipNewlines(P)
const body = parseCommand(P)
if (body) {
// Hoist redirects from redirected_statement(compound_statement, ...) to
// function_definition level per tree-sitter grammar
if (
body.type === 'redirected_statement' &&
body.children.length >= 2 &&
body.children[0]!.type === 'compound_statement'
) {
kids.push(...body.children)
} else {
kids.push(body)
}
}
const last = kids[kids.length - 1]!
return mk(P, 'function_definition', fnKw.startIndex, last.endIndex, kids)
}
function parseDeclaration(P: ParseState, kwTok: Token): TsNode {
const kw = leaf(P, kwTok.value, kwTok)
const kids: TsNode[] = [kw]
while (true) {
skipBlanks(P.L)
const c = peek(P.L)
if (
c === '' ||
c === '\n' ||
c === ';' ||
c === '&' ||
c === '|' ||
c === ')' ||
c === '<' ||
c === '>'
) {
break
}
const a = tryParseAssignment(P)
if (a) {
kids.push(a)
continue
}
// Quoted string or concatenation: `export "FOO=bar"`, `export 'X'`
if (c === '"' || c === "'" || c === '$') {
const w = parseWord(P, 'arg')
if (w) {
kids.push(w)
continue
}
break
}
// Flag like -a or bare variable name
const save = saveLex(P.L)
const tok = nextToken(P.L, 'arg')
if (tok.type === 'WORD' || tok.type === 'NUMBER') {
if (tok.value.startsWith('-')) {
kids.push(leaf(P, 'word', tok))
} else if (isIdentStart(tok.value[0] ?? '')) {
kids.push(mk(P, 'variable_name', tok.start, tok.end, []))
} else {
kids.push(leaf(P, 'word', tok))
}
} else {
restoreLex(P.L, save)
break
}
}
const last = kids[kids.length - 1]!
return mk(P, 'declaration_command', kw.startIndex, last.endIndex, kids)
}
function parseUnset(P: ParseState, kwTok: Token): TsNode {
const kw = leaf(P, 'unset', kwTok)
const kids: TsNode[] = [kw]
while (true) {
skipBlanks(P.L)
const c = peek(P.L)
if (
c === '' ||
c === '\n' ||
c === ';' ||
c === '&' ||
c === '|' ||
c === ')' ||
c === '<' ||
c === '>'
) {
break
}
// SECURITY: use parseWord (not raw nextToken) so quoted strings like
// `unset 'a[$(id)]'` emit a raw_string child that ast.ts can reject.
// Previously `break` silently dropped non-WORD args β hiding the
// arithmetic-subscript code-exec vector from the security walker.
const arg = parseWord(P, 'arg')
if (!arg) break
if (arg.type === 'word') {
if (arg.text.startsWith('-')) {
kids.push(arg)
} else {
kids.push(mk(P, 'variable_name', arg.startIndex, arg.endIndex, []))
}
} else {
kids.push(arg)
}
}
const last = kids[kids.length - 1]!
return mk(P, 'unset_command', kw.startIndex, last.endIndex, kids)
}
function consumeKeyword(P: ParseState, name: string, kids: TsNode[]): void {
skipNewlines(P)
const save = saveLex(P.L)
const t = nextToken(P.L, 'cmd')
if (t.type === 'WORD' && t.value === name) {
kids.push(leaf(P, name, t))
} else {
restoreLex(P.L, save)
}
}
// βββββββββββββββββββββ Test & Arithmetic Expressions βββββββββββββββββββββ
function parseTestExpr(P: ParseState, closer: string): TsNode | null {
return parseTestOr(P, closer)
}
function parseTestOr(P: ParseState, closer: string): TsNode | null {
let left = parseTestAnd(P, closer)
if (!left) return null
while (true) {
skipBlanks(P.L)
const save = saveLex(P.L)
if (peek(P.L) === '|' && peek(P.L, 1) === '|') {
const s = P.L.b
advance(P.L)
advance(P.L)
const op = mk(P, '||', s, P.L.b, [])
const right = parseTestAnd(P, closer)
if (!right) {
restoreLex(P.L, save)
break
}
left = mk(P, 'binary_expression', left.startIndex, right.endIndex, [
left,
op,
right,
])
} else {
break
}
}
return left
}
function parseTestAnd(P: ParseState, closer: string): TsNode | null {
let left = parseTestUnary(P, closer)
if (!left) return null
while (true) {
skipBlanks(P.L)
if (peek(P.L) === '&' && peek(P.L, 1) === '&') {
const s = P.L.b
advance(P.L)
advance(P.L)
const op = mk(P, '&&', s, P.L.b, [])
const right = parseTestUnary(P, closer)
if (!right) break
left = mk(P, 'binary_expression', left.startIndex, right.endIndex, [
left,
op,
right,
])
} else {
break
}
}
return left
}
function parseTestUnary(P: ParseState, closer: string): TsNode | null {
skipBlanks(P.L)
const c = peek(P.L)
if (c === '(') {
const s = P.L.b
advance(P.L)
const open = mk(P, '(', s, P.L.b, [])
const inner = parseTestOr(P, closer)
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')') {
const cs = P.L.b
advance(P.L)
close = mk(P, ')', cs, P.L.b, [])
} else {
close = mk(P, ')', P.L.b, P.L.b, [])
}
const kids = inner ? [open, inner, close] : [open, close]
return mk(
P,
'parenthesized_expression',
open.startIndex,
close.endIndex,
kids,
)
}
return parseTestBinary(P, closer)
}
/**
* Parse `!`-negated or test-operator (`-f`) or parenthesized primary β but NOT
* a binary comparison. Used as LHS of binary_expression so `! x =~ y` binds
* `!` to `x` only, not the whole `x =~ y`.
*/
function parseTestNegatablePrimary(
P: ParseState,
closer: string,
): TsNode | null {
skipBlanks(P.L)
const c = peek(P.L)
if (c === '!') {
const s = P.L.b
advance(P.L)
const bang = mk(P, '!', s, P.L.b, [])
const inner = parseTestNegatablePrimary(P, closer)
if (!inner) return bang
return mk(P, 'unary_expression', bang.startIndex, inner.endIndex, [
bang,
inner,
])
}
if (c === '-' && isIdentStart(peek(P.L, 1))) {
const s = P.L.b
advance(P.L)
while (isIdentChar(peek(P.L))) advance(P.L)
const op = mk(P, 'test_operator', s, P.L.b, [])
skipBlanks(P.L)
const arg = parseTestPrimary(P, closer)
if (!arg) return op
return mk(P, 'unary_expression', op.startIndex, arg.endIndex, [op, arg])
}
return parseTestPrimary(P, closer)
}
function parseTestBinary(P: ParseState, closer: string): TsNode | null {
skipBlanks(P.L)
// `!` in test context binds tighter than =~/==.
// `[[ ! "x" =~ y ]]` β (binary_expression (unary_expression (string)) (regex))
// `[[ ! -f x ]]` β (unary_expression ! (unary_expression (test_operator) (word)))
const left = parseTestNegatablePrimary(P, closer)
if (!left) return null
skipBlanks(P.L)
// Binary comparison: == != =~ -eq -lt etc.
const c = peek(P.L)
const c1 = peek(P.L, 1)
let op: TsNode | null = null
const os = P.L.b
if (c === '=' && c1 === '=') {
advance(P.L)
advance(P.L)
op = mk(P, '==', os, P.L.b, [])
} else if (c === '!' && c1 === '=') {
advance(P.L)
advance(P.L)
op = mk(P, '!=', os, P.L.b, [])
} else if (c === '=' && c1 === '~') {
advance(P.L)
advance(P.L)
op = mk(P, '=~', os, P.L.b, [])
} else if (c === '=' && c1 !== '=') {
advance(P.L)
op = mk(P, '=', os, P.L.b, [])
} else if (c === '<' && c1 !== '<') {
advance(P.L)
op = mk(P, '<', os, P.L.b, [])
} else if (c === '>' && c1 !== '>') {
advance(P.L)
op = mk(P, '>', os, P.L.b, [])
} else if (c === '-' && isIdentStart(c1)) {
advance(P.L)
while (isIdentChar(peek(P.L))) advance(P.L)
op = mk(P, 'test_operator', os, P.L.b, [])
}
if (!op) return left
skipBlanks(P.L)
// In [[ ]], RHS of ==/!=/=/=~ gets special pattern parsing: paren counting
// so @(a|b|c) doesn't break on |, and segments become extglob_pattern/regex.
if (closer === ']]') {
const opText = op.type
if (opText === '=~') {
skipBlanks(P.L)
// If the ENTIRE RHS is a quoted string, emit string/raw_string not
// regex: `[[ "$x" =~ "$y" ]]` β (binary_expression (string) (string)).
// If there's content after the quote (`' boop '(.*)$`), the whole RHS
// stays a single (regex). Peek past the quote to check.
const rc = peek(P.L)
let rhs: TsNode | null = null
if (rc === '"' || rc === "'") {
const save = saveLex(P.L)
const quoted =
rc === '"'
? parseDoubleQuoted(P)
: leaf(P, 'raw_string', nextToken(P.L, 'arg'))
// Check if RHS ends here: only whitespace then ]] or &&/|| or newline
let j = P.L.i
while (j < P.L.len && (P.src[j] === ' ' || P.src[j] === '\t')) j++
const nc = P.src[j] ?? ''
const nc1 = P.src[j + 1] ?? ''
if (
(nc === ']' && nc1 === ']') ||
(nc === '&' && nc1 === '&') ||
(nc === '|' && nc1 === '|') ||
nc === '\n' ||
nc === ''
) {
rhs = quoted
} else {
restoreLex(P.L, save)
}
}
if (!rhs) rhs = parseTestRegexRhs(P)
if (!rhs) return left
return mk(P, 'binary_expression', left.startIndex, rhs.endIndex, [
left,
op,
rhs,
])
}
// Single `=` emits (regex) per tree-sitter; `==` and `!=` emit extglob_pattern
if (opText === '=') {
const rhs = parseTestRegexRhs(P)
if (!rhs) return left
return mk(P, 'binary_expression', left.startIndex, rhs.endIndex, [
left,
op,
rhs,
])
}
if (opText === '==' || opText === '!=') {
const parts = parseTestExtglobRhs(P)
if (parts.length === 0) return left
const last = parts[parts.length - 1]!
return mk(P, 'binary_expression', left.startIndex, last.endIndex, [
left,
op,
...parts,
])
}
}
const right = parseTestPrimary(P, closer)
if (!right) return left
return mk(P, 'binary_expression', left.startIndex, right.endIndex, [
left,
op,
right,
])
}
// RHS of =~ in [[ ]] β scan as single (regex) node with paren/bracket counting
// so | ( ) inside the regex don't break parsing. Stop at ]] or ws+&&/||.
function parseTestRegexRhs(P: ParseState): TsNode | null {
skipBlanks(P.L)
const start = P.L.b
let parenDepth = 0
let bracketDepth = 0
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '\n') break
if (parenDepth === 0 && bracketDepth === 0) {
if (c === ']' && peek(P.L, 1) === ']') break
if (c === ' ' || c === '\t') {
// Peek past blanks for ]] or &&/||
let j = P.L.i
while (j < P.L.len && (P.L.src[j] === ' ' || P.L.src[j] === '\t')) j++
const nc = P.L.src[j] ?? ''
const nc1 = P.L.src[j + 1] ?? ''
if (
(nc === ']' && nc1 === ']') ||
(nc === '&' && nc1 === '&') ||
(nc === '|' && nc1 === '|')
) {
break
}
advance(P.L)
continue
}
}
if (c === '(') parenDepth++
else if (c === ')' && parenDepth > 0) parenDepth--
else if (c === '[') bracketDepth++
else if (c === ']' && bracketDepth > 0) bracketDepth--
advance(P.L)
}
if (P.L.b === start) return null
return mk(P, 'regex', start, P.L.b, [])
}
// RHS of ==/!=/= in [[ ]] β returns array of parts. Bare text β extglob_pattern
// (with paren counting for @(a|b)); $(...)/${}/quoted β proper node types.
// Multiple parts become flat children of binary_expression per tree-sitter.
function parseTestExtglobRhs(P: ParseState): TsNode[] {
skipBlanks(P.L)
const parts: TsNode[] = []
let segStart = P.L.b
let segStartI = P.L.i
let parenDepth = 0
const flushSeg = () => {
if (P.L.i > segStartI) {
const text = P.src.slice(segStartI, P.L.i)
// Pure number stays number; everything else is extglob_pattern
const type = /^\d+$/.test(text) ? 'number' : 'extglob_pattern'
parts.push(mk(P, type, segStart, P.L.b, []))
}
}
while (P.L.i < P.L.len) {
const c = peek(P.L)
if (c === '\\' && P.L.i + 1 < P.L.len) {
advance(P.L)
advance(P.L)
continue
}
if (c === '\n') break
if (parenDepth === 0) {
if (c === ']' && peek(P.L, 1) === ']') break
if (c === ' ' || c === '\t') {
let j = P.L.i
while (j < P.L.len && (P.L.src[j] === ' ' || P.L.src[j] === '\t')) j++
const nc = P.L.src[j] ?? ''
const nc1 = P.L.src[j + 1] ?? ''
if (
(nc === ']' && nc1 === ']') ||
(nc === '&' && nc1 === '&') ||
(nc === '|' && nc1 === '|')
) {
break
}
advance(P.L)
continue
}
}
// $ " ' must be parsed even inside @( ) extglob parens β parseDollarLike
// consumes matching ) so parenDepth stays consistent.
if (c === '$') {
const c1 = peek(P.L, 1)
if (
c1 === '(' ||
c1 === '{' ||
isIdentStart(c1) ||
SPECIAL_VARS.has(c1)
) {
flushSeg()
const exp = parseDollarLike(P)
if (exp) parts.push(exp)
segStart = P.L.b
segStartI = P.L.i
continue
}
}
if (c === '"') {
flushSeg()
parts.push(parseDoubleQuoted(P))
segStart = P.L.b
segStartI = P.L.i
continue
}
if (c === "'") {
flushSeg()
const tok = nextToken(P.L, 'arg')
parts.push(leaf(P, 'raw_string', tok))
segStart = P.L.b
segStartI = P.L.i
continue
}
if (c === '(') parenDepth++
else if (c === ')' && parenDepth > 0) parenDepth--
advance(P.L)
}
flushSeg()
return parts
}
function parseTestPrimary(P: ParseState, closer: string): TsNode | null {
skipBlanks(P.L)
// Stop at closer
if (closer === ']' && peek(P.L) === ']') return null
if (closer === ']]' && peek(P.L) === ']' && peek(P.L, 1) === ']') return null
return parseWord(P, 'arg')
}
/**
* Arithmetic context modes:
* - 'var': bare identifiers β variable_name (default, used in $((..)), ((..)))
* - 'word': bare identifiers β word (c-style for head condition/update clauses)
* - 'assign': identifiers with = β variable_assignment (c-style for init clause)
*/
type ArithMode = 'var' | 'word' | 'assign'
/** Operator precedence table (higher = tighter binding). */
const ARITH_PREC: Record<string, number> = {
'=': 2,
'+=': 2,
'-=': 2,
'*=': 2,
'/=': 2,
'%=': 2,
'<<=': 2,
'>>=': 2,
'&=': 2,
'^=': 2,
'|=': 2,
'||': 4,
'&&': 5,
'|': 6,
'^': 7,
'&': 8,
'==': 9,
'!=': 9,
'<': 10,
'>': 10,
'<=': 10,
'>=': 10,
'<<': 11,
'>>': 11,
'+': 12,
'-': 12,
'*': 13,
'/': 13,
'%': 13,
'**': 14,
}
/** Right-associative operators (assignment and exponent). */
const ARITH_RIGHT_ASSOC = new Set([
'=',
'+=',
'-=',
'*=',
'/=',
'%=',
'<<=',
'>>=',
'&=',
'^=',
'|=',
'**',
])
function parseArithExpr(
P: ParseState,
stop: string,
mode: ArithMode = 'var',
): TsNode | null {
return parseArithTernary(P, stop, mode)
}
/** Top-level: comma-separated list. arithmetic_expansion emits multiple children. */
function parseArithCommaList(
P: ParseState,
stop: string,
mode: ArithMode = 'var',
): TsNode[] {
const out: TsNode[] = []
while (true) {
const e = parseArithTernary(P, stop, mode)
if (e) out.push(e)
skipBlanks(P.L)
if (peek(P.L) === ',' && !isArithStop(P, stop)) {
advance(P.L)
continue
}
break
}
return out
}
function parseArithTernary(
P: ParseState,
stop: string,
mode: ArithMode,
): TsNode | null {
const cond = parseArithBinary(P, stop, 0, mode)
if (!cond) return null
skipBlanks(P.L)
if (peek(P.L) === '?') {
const qs = P.L.b
advance(P.L)
const q = mk(P, '?', qs, P.L.b, [])
const t = parseArithBinary(P, ':', 0, mode)
skipBlanks(P.L)
let colon: TsNode
if (peek(P.L) === ':') {
const cs = P.L.b
advance(P.L)
colon = mk(P, ':', cs, P.L.b, [])
} else {
colon = mk(P, ':', P.L.b, P.L.b, [])
}
const f = parseArithTernary(P, stop, mode)
const last = f ?? colon
const kids: TsNode[] = [cond, q]
if (t) kids.push(t)
kids.push(colon)
if (f) kids.push(f)
return mk(P, 'ternary_expression', cond.startIndex, last.endIndex, kids)
}
return cond
}
/** Scan next arithmetic binary operator; returns [text, length] or null. */
function scanArithOp(P: ParseState): [string, number] | null {
const c = peek(P.L)
const c1 = peek(P.L, 1)
const c2 = peek(P.L, 2)
// 3-char: <<= >>=
if (c === '<' && c1 === '<' && c2 === '=') return ['<<=', 3]
if (c === '>' && c1 === '>' && c2 === '=') return ['>>=', 3]
// 2-char
if (c === '*' && c1 === '*') return ['**', 2]
if (c === '<' && c1 === '<') return ['<<', 2]
if (c === '>' && c1 === '>') return ['>>', 2]
if (c === '=' && c1 === '=') return ['==', 2]
if (c === '!' && c1 === '=') return ['!=', 2]
if (c === '<' && c1 === '=') return ['<=', 2]
if (c === '>' && c1 === '=') return ['>=', 2]
if (c === '&' && c1 === '&') return ['&&', 2]
if (c === '|' && c1 === '|') return ['||', 2]
if (c === '+' && c1 === '=') return ['+=', 2]
if (c === '-' && c1 === '=') return ['-=', 2]
if (c === '*' && c1 === '=') return ['*=', 2]
if (c === '/' && c1 === '=') return ['/=', 2]
if (c === '%' && c1 === '=') return ['%=', 2]
if (c === '&' && c1 === '=') return ['&=', 2]
if (c === '^' && c1 === '=') return ['^=', 2]
if (c === '|' && c1 === '=') return ['|=', 2]
// 1-char β but NOT ++ -- (those are pre/postfix)
if (c === '+' && c1 !== '+') return ['+', 1]
if (c === '-' && c1 !== '-') return ['-', 1]
if (c === '*') return ['*', 1]
if (c === '/') return ['/', 1]
if (c === '%') return ['%', 1]
if (c === '<') return ['<', 1]
if (c === '>') return ['>', 1]
if (c === '&') return ['&', 1]
if (c === '|') return ['|', 1]
if (c === '^') return ['^', 1]
if (c === '=') return ['=', 1]
return null
}
/** Precedence-climbing binary expression parser. */
function parseArithBinary(
P: ParseState,
stop: string,
minPrec: number,
mode: ArithMode,
): TsNode | null {
let left = parseArithUnary(P, stop, mode)
if (!left) return null
while (true) {
skipBlanks(P.L)
if (isArithStop(P, stop)) break
if (peek(P.L) === ',') break
const opInfo = scanArithOp(P)
if (!opInfo) break
const [opText, opLen] = opInfo
const prec = ARITH_PREC[opText]
if (prec === undefined || prec < minPrec) break
const os = P.L.b
for (let k = 0; k < opLen; k++) advance(P.L)
const op = mk(P, opText, os, P.L.b, [])
const nextMin = ARITH_RIGHT_ASSOC.has(opText) ? prec : prec + 1
const right = parseArithBinary(P, stop, nextMin, mode)
if (!right) break
left = mk(P, 'binary_expression', left.startIndex, right.endIndex, [
left,
op,
right,
])
}
return left
}
function parseArithUnary(
P: ParseState,
stop: string,
mode: ArithMode,
): TsNode | null {
skipBlanks(P.L)
if (isArithStop(P, stop)) return null
const c = peek(P.L)
const c1 = peek(P.L, 1)
// Prefix ++ --
if ((c === '+' && c1 === '+') || (c === '-' && c1 === '-')) {
const s = P.L.b
advance(P.L)
advance(P.L)
const op = mk(P, c + c1, s, P.L.b, [])
const inner = parseArithUnary(P, stop, mode)
if (!inner) return op
return mk(P, 'unary_expression', op.startIndex, inner.endIndex, [op, inner])
}
if (c === '-' || c === '+' || c === '!' || c === '~') {
// In 'word'/'assign' mode (c-style for head), `-N` is a single number
// literal per tree-sitter, not unary_expression. 'var' mode uses unary.
if (mode !== 'var' && c === '-' && isDigit(c1)) {
const s = P.L.b
advance(P.L)
while (isDigit(peek(P.L))) advance(P.L)
return mk(P, 'number', s, P.L.b, [])
}
const s = P.L.b
advance(P.L)
const op = mk(P, c, s, P.L.b, [])
const inner = parseArithUnary(P, stop, mode)
if (!inner) return op
return mk(P, 'unary_expression', op.startIndex, inner.endIndex, [op, inner])
}
return parseArithPostfix(P, stop, mode)
}
function parseArithPostfix(
P: ParseState,
stop: string,
mode: ArithMode,
): TsNode | null {
const prim = parseArithPrimary(P, stop, mode)
if (!prim) return null
const c = peek(P.L)
const c1 = peek(P.L, 1)
if ((c === '+' && c1 === '+') || (c === '-' && c1 === '-')) {
const s = P.L.b
advance(P.L)
advance(P.L)
const op = mk(P, c + c1, s, P.L.b, [])
return mk(P, 'postfix_expression', prim.startIndex, op.endIndex, [prim, op])
}
return prim
}
function parseArithPrimary(
P: ParseState,
stop: string,
mode: ArithMode,
): TsNode | null {
skipBlanks(P.L)
if (isArithStop(P, stop)) return null
const c = peek(P.L)
if (c === '(') {
const s = P.L.b
advance(P.L)
const open = mk(P, '(', s, P.L.b, [])
// Parenthesized expression may contain comma-separated exprs
const inners = parseArithCommaList(P, ')', mode)
skipBlanks(P.L)
let close: TsNode
if (peek(P.L) === ')') {
const cs = P.L.b
advance(P.L)
close = mk(P, ')', cs, P.L.b, [])
} else {
close = mk(P, ')', P.L.b, P.L.b, [])
}
return mk(P, 'parenthesized_expression', open.startIndex, close.endIndex, [
open,
...inners,
close,
])
}
if (c === '"') {
return parseDoubleQuoted(P)
}
if (c === '$') {
return parseDollarLike(P)
}
if (isDigit(c)) {
const s = P.L.b
while (isDigit(peek(P.L))) advance(P.L)
// Hex: 0x1f
if (
P.L.b - s === 1 &&
c === '0' &&
(peek(P.L) === 'x' || peek(P.L) === 'X')
) {
advance(P.L)
while (isHexDigit(peek(P.L))) advance(P.L)
}
// Base notation: BASE#DIGITS e.g. 2#1010, 16#ff
else if (peek(P.L) === '#') {
advance(P.L)
while (isBaseDigit(peek(P.L))) advance(P.L)
}
return mk(P, 'number', s, P.L.b, [])
}
if (isIdentStart(c)) {
const s = P.L.b
while (isIdentChar(peek(P.L))) advance(P.L)
const nc = peek(P.L)
// Assignment in 'assign' mode (c-style for init): emit variable_assignment
// so chained `a = b = c = 1` nests correctly. Other modes treat `=` as a
// binary_expression operator via the precedence table.
if (mode === 'assign') {
skipBlanks(P.L)
const ac = peek(P.L)
const ac1 = peek(P.L, 1)
if (ac === '=' && ac1 !== '=') {
const vn = mk(P, 'variable_name', s, P.L.b, [])
const es = P.L.b
advance(P.L)
const eq = mk(P, '=', es, P.L.b, [])
// RHS may itself be another assignment (chained)
const val = parseArithTernary(P, stop, mode)
const end = val ? val.endIndex : eq.endIndex
const kids = val ? [vn, eq, val] : [vn, eq]
return mk(P, 'variable_assignment', s, end, kids)
}
}
// Subscript
if (nc === '[') {
const vn = mk(P, 'variable_name', s, P.L.b, [])
const brS = P.L.b
advance(P.L)
const brOpen = mk(P, '[', brS, P.L.b, [])
const idx = parseArithTernary(P, ']', 'var') ?? parseDollarLike(P)
skipBlanks(P.L)
let brClose: TsNode
if (peek(P.L) === ']') {
const cs = P.L.b
advance(P.L)
brClose = mk(P, ']', cs, P.L.b, [])
} else {
brClose = mk(P, ']', P.L.b, P.L.b, [])
}
const kids = idx ? [vn, brOpen, idx, brClose] : [vn, brOpen, brClose]
return mk(P, 'subscript', s, brClose.endIndex, kids)
}
// Bare identifier: variable_name in 'var' mode, word in 'word'/'assign' mode.
// 'assign' mode falls through to word when no `=` follows (c-style for
// cond/update clauses: `c<=5` β binary_expression(word, number)).
const identType = mode === 'var' ? 'variable_name' : 'word'
return mk(P, identType, s, P.L.b, [])
}
return null
}
function isArithStop(P: ParseState, stop: string): boolean {
const c = peek(P.L)
if (stop === '))') return c === ')' && peek(P.L, 1) === ')'
if (stop === ')') return c === ')'
if (stop === ';') return c === ';'
if (stop === ':') return c === ':'
if (stop === ']') return c === ']'
if (stop === '}') return c === '}'
if (stop === ':}') return c === ':' || c === '}'
return c === '' || c === '\n'
}