π File detail
utils/bash/treeSitterAnalysis.ts
π― Use case
This file lives under βutils/β, which covers cross-cutting helpers (shell, tempfiles, settings, messages, process input, β¦). On the API surface it exposes QuoteContext, CompoundStructure, DangerousPatterns, TreeSitterAnalysis, and extractQuoteContext (and more) β mainly functions, hooks, or classes. What the file header says: Tree-sitter AST analysis utilities for bash command security validation. These functions extract security-relevant information from tree-sitter parse trees, providing more accurate analysis than regex/shell-quote parsing. Each function takes a root node and command string, and re.
Generated from folder role, exports, dependency roots, and inline comments β not hand-reviewed for every path.
π§ Inline summary
Tree-sitter AST analysis utilities for bash command security validation. These functions extract security-relevant information from tree-sitter parse trees, providing more accurate analysis than regex/shell-quote parsing. Each function takes a root node and command string, and returns structured data that can be used by security validators. The native NAPI parser returns plain JS objects β no cleanup needed.
π€ Exports (heuristic)
QuoteContextCompoundStructureDangerousPatternsTreeSitterAnalysisextractQuoteContextextractCompoundStructurehasActualOperatorNodesextractDangerousPatternsanalyzeCommand
π₯οΈ Source preview
/**
* Tree-sitter AST analysis utilities for bash command security validation.
*
* These functions extract security-relevant information from tree-sitter
* parse trees, providing more accurate analysis than regex/shell-quote
* parsing. Each function takes a root node and command string, and returns
* structured data that can be used by security validators.
*
* The native NAPI parser returns plain JS objects β no cleanup needed.
*/
type TreeSitterNode = {
type: string
text: string
startIndex: number
endIndex: number
children: TreeSitterNode[]
childCount: number
}
export type QuoteContext = {
/** Command text with single-quoted content removed (double-quoted content preserved) */
withDoubleQuotes: string
/** Command text with all quoted content removed */
fullyUnquoted: string
/** Like fullyUnquoted but preserves quote characters (', ") */
unquotedKeepQuoteChars: string
}
export type CompoundStructure = {
/** Whether the command has compound operators (&&, ||, ;) at the top level */
hasCompoundOperators: boolean
/** Whether the command has pipelines */
hasPipeline: boolean
/** Whether the command has subshells */
hasSubshell: boolean
/** Whether the command has command groups ({...}) */
hasCommandGroup: boolean
/** Top-level compound operator types found */
operators: string[]
/** Individual command segments split by compound operators */
segments: string[]
}
export type DangerousPatterns = {
/** Has $() or backtick command substitution (outside quotes that would make it safe) */
hasCommandSubstitution: boolean
/** Has <() or >() process substitution */
hasProcessSubstitution: boolean
/** Has ${...} parameter expansion */
hasParameterExpansion: boolean
/** Has heredoc */
hasHeredoc: boolean
/** Has comment */
hasComment: boolean
}
export type TreeSitterAnalysis = {
quoteContext: QuoteContext
compoundStructure: CompoundStructure
/** Whether actual operator nodes (;, &&, ||) exist β if false, \; is just a word argument */
hasActualOperatorNodes: boolean
dangerousPatterns: DangerousPatterns
}
type QuoteSpans = {
raw: Array<[number, number]> // raw_string (single-quoted)
ansiC: Array<[number, number]> // ansi_c_string ($'...')
double: Array<[number, number]> // string (double-quoted)
heredoc: Array<[number, number]> // quoted heredoc_redirect
}
/**
* Single-pass collection of all quote-related spans.
* Previously this was 5 separate tree walks (one per type-set plus
* allQuoteTypes plus heredoc); fusing cuts tree-traversal ~5x.
*
* Replicates the per-type walk semantics: each original walk stopped at
* its own type. So the raw_string walk would recurse THROUGH a string
* node (not its type) to reach nested raw_string inside $(...), but the
* string walk would stop at the outer string. We track `inDouble` to
* collect the *outermost* string span per path, while still descending
* into $()/${} bodies to pick up inner raw_string/ansi_c_string.
*
* raw_string / ansi_c_string / quoted-heredoc bodies are literal text
* in bash (no expansion), so no nested quote nodes exist β return early.
*/
function collectQuoteSpans(
node: TreeSitterNode,
out: QuoteSpans,
inDouble: boolean,
): void {
switch (node.type) {
case 'raw_string':
out.raw.push([node.startIndex, node.endIndex])
return // literal body, no nested quotes possible
case 'ansi_c_string':
out.ansiC.push([node.startIndex, node.endIndex])
return // literal body
case 'string':
// Only collect the outermost string (matches old per-type walk
// which stops at first match). Recurse regardless β a nested
// $(cmd 'x') inside "..." has a real inner raw_string.
if (!inDouble) out.double.push([node.startIndex, node.endIndex])
for (const child of node.children) {
if (child) collectQuoteSpans(child, out, true)
}
return
case 'heredoc_redirect': {
// Quoted heredocs (<<'EOF', <<"EOF", <<\EOF): literal body.
// Unquoted (<<EOF) expands $()/${} β the body can contain
// $(cmd 'x') whose inner '...' IS a real raw_string node.
// Detection: heredoc_start text starts with '/"/\\
// Matches sync path's extractHeredocs({ quotedOnly: true }).
let isQuoted = false
for (const child of node.children) {
if (child && child.type === 'heredoc_start') {
const first = child.text[0]
isQuoted = first === "'" || first === '"' || first === '\\'
break
}
}
if (isQuoted) {
out.heredoc.push([node.startIndex, node.endIndex])
return // literal body, no nested quote nodes
}
// Unquoted: recurse into heredoc_body β command_substitution β
// inner quote nodes. The original per-type walks did NOT stop at
// heredoc_redirect (not in their type sets), so they recursed here.
break
}
}
for (const child of node.children) {
if (child) collectQuoteSpans(child, out, inDouble)
}
}
/**
* Builds a Set of all character positions covered by the given spans.
*/
function buildPositionSet(spans: Array<[number, number]>): Set<number> {
const set = new Set<number>()
for (const [start, end] of spans) {
for (let i = start; i < end; i++) {
set.add(i)
}
}
return set
}
/**
* Drops spans that are fully contained within another span, keeping only the
* outermost. Nested quotes (e.g., `"$(echo 'hi')"`) yield overlapping spans
* β the inner raw_string is found by recursing into the outer string node.
* Processing overlapping spans corrupts indices since removing/replacing the
* outer span shifts the inner span's start/end into stale positions.
*/
function dropContainedSpans<T extends readonly [number, number, ...unknown[]]>(
spans: T[],
): T[] {
return spans.filter(
(s, i) =>
!spans.some(
(other, j) =>
j !== i &&
other[0] <= s[0] &&
other[1] >= s[1] &&
(other[0] < s[0] || other[1] > s[1]),
),
)
}
/**
* Removes spans from a string, returning the string with those character
* ranges removed.
*/
function removeSpans(command: string, spans: Array<[number, number]>): string {
if (spans.length === 0) return command
// Drop inner spans that are fully contained in an outer one, then sort by
// start index descending so we can splice without offset shifts.
const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
let result = command
for (const [start, end] of sorted) {
result = result.slice(0, start) + result.slice(end)
}
return result
}
/**
* Replaces spans with just the quote delimiters (preserving ' and " characters).
*/
function replaceSpansKeepQuotes(
command: string,
spans: Array<[number, number, string, string]>,
): string {
if (spans.length === 0) return command
const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
let result = command
for (const [start, end, open, close] of sorted) {
// Replace content but keep the quote delimiters
result = result.slice(0, start) + open + close + result.slice(end)
}
return result
}
/**
* Extract quote context from the tree-sitter AST.
* Replaces the manual character-by-character extractQuotedContent() function.
*
* Tree-sitter node types:
* - raw_string: single-quoted ('...')
* - string: double-quoted ("...")
* - ansi_c_string: ANSI-C quoting ($'...') β span includes the leading $
* - heredoc_redirect: QUOTED heredocs only (<<'EOF', <<"EOF", <<\EOF) β
* the full redirect span (<<, delimiters, body, newlines) is stripped
* since the body is literal text in bash (no expansion). UNQUOTED
* heredocs (<<EOF) are left in place since bash expands $(...)/${...}
* inside them, and validators need to see those patterns. Matches the
* sync path's extractHeredocs({ quotedOnly: true }).
*/
export function extractQuoteContext(
rootNode: unknown,
command: string,
): QuoteContext {
// Single walk collects all quote span types at once.
const spans: QuoteSpans = { raw: [], ansiC: [], double: [], heredoc: [] }
collectQuoteSpans(rootNode as TreeSitterNode, spans, false)
const singleQuoteSpans = spans.raw
const ansiCSpans = spans.ansiC
const doubleQuoteSpans = spans.double
const quotedHeredocSpans = spans.heredoc
const allQuoteSpans = [
...singleQuoteSpans,
...ansiCSpans,
...doubleQuoteSpans,
...quotedHeredocSpans,
]
// Build a set of positions that should be excluded for each output variant.
// For withDoubleQuotes: remove single-quoted spans entirely, plus the
// opening/closing `"` delimiters of double-quoted spans (but keep the
// content between them). This matches the regex extractQuotedContent()
// semantics where `"` toggles quote state but content is still emitted.
const singleQuoteSet = buildPositionSet([
...singleQuoteSpans,
...ansiCSpans,
...quotedHeredocSpans,
])
const doubleQuoteDelimSet = new Set<number>()
for (const [start, end] of doubleQuoteSpans) {
doubleQuoteDelimSet.add(start) // opening "
doubleQuoteDelimSet.add(end - 1) // closing "
}
let withDoubleQuotes = ''
for (let i = 0; i < command.length; i++) {
if (singleQuoteSet.has(i)) continue
if (doubleQuoteDelimSet.has(i)) continue
withDoubleQuotes += command[i]
}
// fullyUnquoted: remove all quoted content
const fullyUnquoted = removeSpans(command, allQuoteSpans)
// unquotedKeepQuoteChars: remove content but keep delimiter chars
const spansWithQuoteChars: Array<[number, number, string, string]> = []
for (const [start, end] of singleQuoteSpans) {
spansWithQuoteChars.push([start, end, "'", "'"])
}
for (const [start, end] of ansiCSpans) {
// ansi_c_string spans include the leading $; preserve it so this
// matches the regex path, which treats $ as unquoted preceding '.
spansWithQuoteChars.push([start, end, "$'", "'"])
}
for (const [start, end] of doubleQuoteSpans) {
spansWithQuoteChars.push([start, end, '"', '"'])
}
for (const [start, end] of quotedHeredocSpans) {
// Heredoc redirect spans have no inline quote delimiters β strip entirely.
spansWithQuoteChars.push([start, end, '', ''])
}
const unquotedKeepQuoteChars = replaceSpansKeepQuotes(
command,
spansWithQuoteChars,
)
return { withDoubleQuotes, fullyUnquoted, unquotedKeepQuoteChars }
}
/**
* Extract compound command structure from the AST.
* Replaces isUnsafeCompoundCommand() and splitCommand() for tree-sitter path.
*/
export function extractCompoundStructure(
rootNode: unknown,
command: string,
): CompoundStructure {
const n = rootNode as TreeSitterNode
const operators: string[] = []
const segments: string[] = []
let hasSubshell = false
let hasCommandGroup = false
let hasPipeline = false
// Walk top-level children of the program node
function walkTopLevel(node: TreeSitterNode): void {
for (const child of node.children) {
if (!child) continue
if (child.type === 'list') {
// list nodes contain && and || operators
for (const listChild of child.children) {
if (!listChild) continue
if (listChild.type === '&&' || listChild.type === '||') {
operators.push(listChild.type)
} else if (
listChild.type === 'list' ||
listChild.type === 'redirected_statement'
) {
// Nested list, or redirected_statement wrapping a list/pipeline β
// recurse so inner operators/pipelines are detected. For
// `cmd1 && cmd2 2>/dev/null && cmd3`, the redirected_statement
// wraps `list(cmd1 && cmd2)` β the inner `&&` would be missed
// without recursion.
walkTopLevel({ ...node, children: [listChild] } as TreeSitterNode)
} else if (listChild.type === 'pipeline') {
hasPipeline = true
segments.push(listChild.text)
} else if (listChild.type === 'subshell') {
hasSubshell = true
segments.push(listChild.text)
} else if (listChild.type === 'compound_statement') {
hasCommandGroup = true
segments.push(listChild.text)
} else {
segments.push(listChild.text)
}
}
} else if (child.type === ';') {
operators.push(';')
} else if (child.type === 'pipeline') {
hasPipeline = true
segments.push(child.text)
} else if (child.type === 'subshell') {
hasSubshell = true
segments.push(child.text)
} else if (child.type === 'compound_statement') {
hasCommandGroup = true
segments.push(child.text)
} else if (
child.type === 'command' ||
child.type === 'declaration_command' ||
child.type === 'variable_assignment'
) {
segments.push(child.text)
} else if (child.type === 'redirected_statement') {
// `cd ~/src && find path 2>/dev/null` β tree-sitter wraps the ENTIRE
// compound in a redirected_statement: program β redirected_statement β
// (list β cmd1, &&, cmd2) + file_redirect. Same for `cmd1 | cmd2 > out`
// (wraps pipeline) and `(cmd) > out` (wraps subshell). Recurse to
// detect the inner structure; skip file_redirect children (redirects
// don't affect compound/pipeline classification).
let foundInner = false
for (const inner of child.children) {
if (!inner || inner.type === 'file_redirect') continue
foundInner = true
walkTopLevel({ ...child, children: [inner] } as TreeSitterNode)
}
if (!foundInner) {
// Standalone redirect with no body (shouldn't happen, but fail-safe)
segments.push(child.text)
}
} else if (child.type === 'negated_command') {
// `! cmd` β recurse into the inner command so its structure is
// classified (pipeline/subshell/etc.), but also record the full
// negated text as a segment so segments.length stays meaningful.
segments.push(child.text)
walkTopLevel(child)
} else if (
child.type === 'if_statement' ||
child.type === 'while_statement' ||
child.type === 'for_statement' ||
child.type === 'case_statement' ||
child.type === 'function_definition'
) {
// Control-flow constructs: the construct itself is one segment,
// but recurse so inner pipelines/subshells/operators are detected.
segments.push(child.text)
walkTopLevel(child)
}
}
}
walkTopLevel(n)
// If no segments found, the whole command is one segment
if (segments.length === 0) {
segments.push(command)
}
return {
hasCompoundOperators: operators.length > 0,
hasPipeline,
hasSubshell,
hasCommandGroup,
operators,
segments,
}
}
/**
* Check whether the AST contains actual operator nodes (;, &&, ||).
*
* This is the key function for eliminating the `find -exec \;` false positive.
* Tree-sitter parses `\;` as part of a `word` node (an argument to find),
* NOT as a `;` operator. So if no actual `;` operator nodes exist in the AST,
* there are no compound operators and hasBackslashEscapedOperator() can be skipped.
*/
export function hasActualOperatorNodes(rootNode: unknown): boolean {
const n = rootNode as TreeSitterNode
function walk(node: TreeSitterNode): boolean {
// Check for operator types that indicate compound commands
if (node.type === ';' || node.type === '&&' || node.type === '||') {
// Verify this is a child of a list or program, not inside a command
return true
}
if (node.type === 'list') {
// A list node means there are compound operators
return true
}
for (const child of node.children) {
if (child && walk(child)) return true
}
return false
}
return walk(n)
}
/**
* Extract dangerous pattern information from the AST.
*/
export function extractDangerousPatterns(rootNode: unknown): DangerousPatterns {
const n = rootNode as TreeSitterNode
let hasCommandSubstitution = false
let hasProcessSubstitution = false
let hasParameterExpansion = false
let hasHeredoc = false
let hasComment = false
function walk(node: TreeSitterNode): void {
switch (node.type) {
case 'command_substitution':
hasCommandSubstitution = true
break
case 'process_substitution':
hasProcessSubstitution = true
break
case 'expansion':
hasParameterExpansion = true
break
case 'heredoc_redirect':
hasHeredoc = true
break
case 'comment':
hasComment = true
break
}
for (const child of node.children) {
if (child) walk(child)
}
}
walk(n)
return {
hasCommandSubstitution,
hasProcessSubstitution,
hasParameterExpansion,
hasHeredoc,
hasComment,
}
}
/**
* Perform complete tree-sitter analysis of a command.
* Extracts all security-relevant data from the AST in one pass.
* This data must be extracted before tree.delete() is called.
*/
export function analyzeCommand(
rootNode: unknown,
command: string,
): TreeSitterAnalysis {
return {
quoteContext: extractQuoteContext(rootNode, command),
compoundStructure: extractCompoundStructure(rootNode, command),
hasActualOperatorNodes: hasActualOperatorNodes(rootNode),
dangerousPatterns: extractDangerousPatterns(rootNode),
}
}