📄 File detail
utils/computerUse/executor.ts
🎯 Use case
This file lives under “utils/”, which covers cross-cutting helpers (shell, tempfiles, settings, messages, process input, …). On the API surface it exposes createCliExecutor and unhideComputerUseApps — mainly functions, hooks, or classes. Dependencies touch @ant. It composes internal code from debug, errors, execFileNoThrow, sleep, and common (relative imports). What the file header says: CLI `ComputerExecutor` implementation. Wraps two native modules: - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC. Contract: `packages/desktop/computer-use-mcp/src/executor.ts`.
Generated from folder role, exports, dependency roots, and inline comments β not hand-reviewed for every path.
🧭 Inline summary
CLI `ComputerExecutor` implementation. Wraps two native modules: - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC. Contract: `packages/desktop/computer-use-mcp/src/executor.ts` in the apps repo. The reference impl is Cowork's `apps/desktop/src/main/nest-only/computer-use/executor.ts` — see notable deviations under "CLI deltas" below. ── CLI deltas from Cowork ── No `withClickThrough`. Cowork wraps every mouse op in `BrowserWindow.setIgnoreMouseEvents(true)` so clicks fall through the overlay. We're a terminal — no window — so the click-through bracket is a no-op. The sentinel `CLI_HOST_BUNDLE_ID` never matches frontmost. Terminal as surrogate host. `getTerminalBundleId()` detects the emulator we're running inside. It's passed as `hostBundleId` to `prepareDisplay`/`resolvePrepareCapture` so the Swift side exempts it from hide AND skips it in the activate z-order walk (so the terminal being frontmost doesn't eat clicks meant for the target app). Also stripped from `allowedBundleIds` via `withoutTerminal()` so … (summary truncated here; the full text is in the file header comment below).
📤 Exports (heuristic)
createCliExecutor, unhideComputerUseApps
📚 External import roots
Package roots taken from `from "…"` clauses (relative paths omitted).
@ant
🖥️ Source preview
/**
* CLI `ComputerExecutor` implementation. Wraps two native modules:
* - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app
* - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC
*
* Contract: `packages/desktop/computer-use-mcp/src/executor.ts` in the apps
* repo. The reference impl is Cowork's `apps/desktop/src/main/nest-only/
* computer-use/executor.ts` — see notable deviations under "CLI deltas" below.
*
* ── CLI deltas from Cowork ─────────────────────────────────────────────────
*
* No `withClickThrough`. Cowork wraps every mouse op in
* `BrowserWindow.setIgnoreMouseEvents(true)` so clicks fall through the
* overlay. We're a terminal — no window — so the click-through bracket is
* a no-op. The sentinel `CLI_HOST_BUNDLE_ID` never matches frontmost.
*
* Terminal as surrogate host. `getTerminalBundleId()` detects the emulator
* we're running inside. It's passed as `hostBundleId` to `prepareDisplay`/
* `resolvePrepareCapture` so the Swift side exempts it from hide AND skips
* it in the activate z-order walk (so the terminal being frontmost doesn't
* eat clicks meant for the target app). Also stripped from `allowedBundleIds`
* via `withoutTerminal()` so screenshots don't capture it (Swift 0.2.1's
* captureExcluding takes an allow-list despite the name — apps#30355).
* `capabilities.hostBundleId` stays as the sentinel — the package's
* frontmost gate uses that, and the terminal being frontmost is fine.
*
* Clipboard via `pbcopy`/`pbpaste`. No Electron `clipboard` module.
*/
import type {
ComputerExecutor,
DisplayGeometry,
FrontmostApp,
InstalledApp,
ResolvePrepareCaptureResult,
RunningApp,
ScreenshotResult,
} from '@ant/computer-use-mcp'
import { API_RESIZE_PARAMS, targetImageSize } from '@ant/computer-use-mcp'
import { logForDebugging } from '../debug.js'
import { errorMessage } from '../errors.js'
import { execFileNoThrow } from '../execFileNoThrow.js'
import { sleep } from '../sleep.js'
import {
CLI_CU_CAPABILITIES,
CLI_HOST_BUNDLE_ID,
getTerminalBundleId,
} from './common.js'
import { drainRunLoop } from './drainRunLoop.js'
import { notifyExpectedEscape } from './escHotkey.js'
import { requireComputerUseInput } from './inputLoader.js'
import { requireComputerUseSwift } from './swiftLoader.js'
// ── Helpers ──────────────────────────────────────────────────────────────────
/**
* JPEG quality argument handed to every Swift capture call made in this file
* (`resolvePrepareCapture`, `captureExcluding`, `captureRegion`).
* NOTE(review): presumably on a 0–1 scale — confirm against the Swift module.
*/
const SCREENSHOT_JPEG_QUALITY = 0.75
/**
 * Logical → physical → API target dims. See `targetImageSize` + COORDINATES.md.
 *
 * Each logical dimension is multiplied by `scaleFactor` and rounded to the
 * nearest integer before being handed to `targetImageSize` together with
 * `API_RESIZE_PARAMS`.
 *
 * @param logicalW Width in logical units.
 * @param logicalH Height in logical units.
 * @param scaleFactor Multiplier applied to both dimensions.
 * @returns The `[width, height]` pair produced by `targetImageSize`.
 */
function computeTargetDims(
  logicalW: number,
  logicalH: number,
  scaleFactor: number,
): [number, number] {
  const toPhysical = (logical: number): number =>
    Math.round(logical * scaleFactor)
  return targetImageSize(
    toPhysical(logicalW),
    toPhysical(logicalH),
    API_RESIZE_PARAMS,
  )
}
/**
 * Read the clipboard by running `pbpaste` (no Electron `clipboard` module in
 * the CLI).
 *
 * @returns Whatever `pbpaste` wrote to stdout.
 * @throws Error when `pbpaste` reports a non-zero exit code.
 */
async function readClipboardViaPbpaste(): Promise<string> {
  const result = await execFileNoThrow('pbpaste', [], { useCwd: false })
  if (result.code === 0) {
    return result.stdout
  }
  throw new Error(`pbpaste exited with code ${result.code}`)
}
/**
 * Replace the clipboard contents by piping `text` into `pbcopy`.
 *
 * @param text Passed to `pbcopy` as its input.
 * @throws Error when `pbcopy` reports a non-zero exit code.
 */
async function writeClipboardViaPbcopy(text: string): Promise<void> {
  const result = await execFileNoThrow('pbcopy', [], {
    input: text,
    useCwd: false,
  })
  if (result.code === 0) {
    return
  }
  throw new Error(`pbcopy exited with code ${result.code}`)
}
/** Type of the input-module handle returned by `requireComputerUseInput()`. */
type Input = ReturnType<typeof requireComputerUseInput>
/**
 * True only for a one-element key sequence whose single entry is "escape" or
 * "esc", compared case-insensitively. Used to hole-punch the CGEventTap abort
 * for model-synthesized Escape — enigo accepts both spellings, so the tap
 * must too.
 *
 * @param parts Key names of one sequence (already split).
 * @returns `false` for empty or multi-key sequences and for any other key.
 */
function isBareEscape(parts: readonly string[]): boolean {
  if (parts.length === 1) {
    switch (parts[0]!.toLowerCase()) {
      case 'escape':
      case 'esc':
        return true
    }
  }
  return false
}
/**
 * Pause after an instant move: 50ms — an input→HID→AppKit→NSEvent round-trip
 * before the caller reads `NSEvent.mouseLocation` or dispatches a click.
 */
const MOVE_SETTLE_MS = 50

/**
 * Instant move followed by the `MOVE_SETTLE_MS` pause. Used for click, scroll,
 * and drag-from; `animatedMove` is reserved for drag-to only. The
 * intermediate animation frames were triggering hover states and, on the
 * decomposed mouseDown/moveMouse path, emitting stray `.leftMouseDragged`
 * events (toolCalls.ts handleScroll's mouse_full workaround).
 *
 * @param input Input module handle.
 * @param x Destination x, forwarded to `input.moveMouse` unchanged.
 * @param y Destination y, forwarded to `input.moveMouse` unchanged.
 */
async function moveAndSettle(input: Input, x: number, y: number): Promise<void> {
  // Third argument is `false` on this path, exactly as in every other caller
  // of `moveMouse` in this file.
  await input.moveMouse(x, y, false)
  await sleep(MOVE_SETTLE_MS)
}
/**
 * Release every key in `pressed`, newest first (last pressed = first
 * released). `pressed` is emptied as a side effect. A failing release is
 * ignored so it can never mask the error that got us here.
 *
 * The loop re-pops on every pass instead of capturing the length up front: if
 * a drainRunLoop-orphaned press lambda resolves an in-flight input.key() AFTER
 * the caller's `finally` has invoked us, that late push is still released on
 * a later pass. The orphaned flag only stops the lambda at its NEXT check,
 * not at the await it is currently parked on.
 *
 * @param input Input module handle.
 * @param pressed Keys that were actually pressed; mutated (drained) in place.
 */
async function releasePressed(input: Input, pressed: string[]): Promise<void> {
  for (
    let keyName = pressed.pop();
    keyName !== undefined;
    keyName = pressed.pop()
  ) {
    try {
      await input.key(keyName, 'release')
    } catch {
      // Best-effort: keep going so the remaining keys are still released.
    }
  }
}
/**
 * Run `fn()` with the given modifiers held down. Only modifiers whose press
 * call completed are recorded, so a throw part-way through the press phase
 * releases exactly what went down — no stuck modifiers. The `finally` covers
 * both a press-phase throw and a throw from `fn()`.
 *
 * Caller must already be inside drainRunLoop() — key() dispatches to the
 * main queue and needs the pump to resolve.
 *
 * @param input Input module handle.
 * @param mods Modifier key names, pressed in order and released in reverse.
 * @param fn Action to perform while the modifiers are held.
 * @returns Whatever `fn()` resolves to.
 */
async function withModifiers<T>(
  input: Input,
  mods: string[],
  fn: () => Promise<T>,
): Promise<T> {
  const held: string[] = []
  const pressAll = async (): Promise<void> => {
    for (const mod of mods) {
      await input.key(mod, 'press')
      // Recorded only after the press resolved.
      held.push(mod)
    }
  }
  try {
    await pressAll()
    const outcome = await fn()
    return outcome
  } finally {
    await releasePressed(input, held)
  }
}
/**
 * Port of Cowork's `typeViaClipboard`. Sequence:
 *   1. Save the user's clipboard (a failed read is logged and the restore
 *      step is skipped).
 *   2. Write our text.
 *   3. READ-BACK VERIFY — clipboard writes can silently fail. If the
 *      read-back doesn't match, never press Cmd+V (would paste junk).
 *   4. Cmd+V via keys().
 *   5. Sleep 100ms — battle-tested threshold for the paste-effect vs
 *      clipboard-restore race. Restoring too soon means the target app
 *      pastes the RESTORED content.
 *   6. Restore — in a `finally`, so a throw between 2-5 never leaves the
 *      user's clipboard clobbered. Restore failures are logged and swallowed.
 *
 * @param input Input module handle used for the Cmd+V chord.
 * @param text Text to paste.
 * @throws Error when the write fails or the read-back does not equal `text`.
 */
async function typeViaClipboard(input: Input, text: string): Promise<void> {
  let priorContents: string | undefined
  try {
    priorContents = await readClipboardViaPbpaste()
  } catch {
    logForDebugging(
      '[computer-use] pbpaste before paste failed; proceeding without restore',
    )
  }

  // Puts the saved contents back; a no-op when the initial read failed.
  const restorePrior = async (): Promise<void> => {
    if (typeof priorContents !== 'string') {
      return
    }
    try {
      await writeClipboardViaPbcopy(priorContents)
    } catch {
      logForDebugging('[computer-use] clipboard restore after paste failed')
    }
  }

  try {
    await writeClipboardViaPbcopy(text)
    const readBack = await readClipboardViaPbpaste()
    if (readBack !== text) {
      throw new Error('Clipboard write did not round-trip.')
    }
    await input.keys(['command', 'v'])
    await sleep(100)
  } finally {
    await restorePrior()
  }
}
/**
 * Port of Cowork's `animateMouseMovement` + `animatedMove`. Ease-out-cubic at
 * 60fps; distance-proportional duration at 2000 px/sec, capped at 0.5s. When
 * the sub-gate is off, or the computed duration is under 0.03s (< ~2 frames),
 * falls through to `moveAndSettle`. When the cursor is already within 1 unit
 * of the target the function returns without moving at all. Called only from
 * `drag` for the press→to motion — target apps may watch for
 * `.leftMouseDragged` specifically (not just "button down + position
 * changed") and the slow motion gives them time to process intermediate
 * positions (scrollbars, window resizes).
 *
 * @param input Input module handle.
 * @param targetX Destination x.
 * @param targetY Destination y.
 * @param mouseAnimationEnabled When false, the move is instant.
 */
async function animatedMove(
  input: Input,
  targetX: number,
  targetY: number,
  mouseAnimationEnabled: boolean,
): Promise<void> {
  if (!mouseAnimationEnabled) {
    await moveAndSettle(input, targetX, targetY)
    return
  }

  const origin = await input.mouseLocation()
  const deltaX = targetX - origin.x
  const deltaY = targetY - origin.y
  const travel = Math.hypot(deltaX, deltaY)
  if (travel < 1) return

  const seconds = Math.min(travel / 2000, 0.5)
  if (seconds < 0.03) {
    await moveAndSettle(input, targetX, targetY)
    return
  }

  const fps = 60
  const frameGapMs = 1000 / fps
  const frameCount = Math.floor(seconds * fps)
  let step = 0
  while (step < frameCount) {
    step += 1
    const progress = step / frameCount
    const eased = 1 - Math.pow(1 - progress, 3)
    await input.moveMouse(
      Math.round(origin.x + deltaX * eased),
      Math.round(origin.y + deltaY * eased),
      false,
    )
    // No inter-frame pause after the final frame; the settle sleep follows.
    if (step !== frameCount) {
      await sleep(frameGapMs)
    }
  }
  // Last frame has no trailing sleep — same HID round-trip before the
  // caller's mouseButton reads NSEvent.mouseLocation.
  await sleep(MOVE_SETTLE_MS)
}
// ── Factory ──────────────────────────────────────────────────────────────────
/**
* Build the CLI implementation of `ComputerExecutor`.
*
* Loads the Swift module once, resolves the terminal bundle ID once, and
* returns an object whose methods forward either to the Swift module (`cu`)
* or to the input module obtained from `requireComputerUseInput()`.
*
* @param opts.getMouseAnimationEnabled Read on every `drag` call to choose
*   between the animated and the instant move to the drop point.
* @param opts.getHideBeforeActionEnabled Read on every `prepareForAction`
*   call; when it returns false the hide step is skipped entirely.
* @returns The executor object.
* @throws Error when `process.platform` is not `'darwin'`.
*/
export function createCliExecutor(opts: {
getMouseAnimationEnabled: () => boolean
getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
// Fail fast off macOS, before the Swift module is loaded below.
if (process.platform !== 'darwin') {
throw new Error(
`createCliExecutor called on ${process.platform}. Computer control is macOS-only.`,
)
}
// Swift loaded once at factory time — every executor method needs it.
// Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
// call — it caches internally, so screenshot-only flows never pull the
// enigo .node.
const cu = requireComputerUseSwift()
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
const terminalBundleId = getTerminalBundleId()
// Host ID handed to the Swift side: the detected terminal when there is one,
// otherwise the sentinel.
const surrogateHost = terminalBundleId ?? CLI_HOST_BUNDLE_ID
// Swift 0.2.1's captureExcluding/captureRegion take an ALLOW list despite the
// name (apps#30355 — complement computed Swift-side against running apps).
// The terminal isn't in the user's grants so it's naturally excluded, but if
// the package ever passes it through we strip it here so the terminal never
// photobombs a screenshot.
// Always returns a fresh array, never the caller's.
const withoutTerminal = (allowed: readonly string[]): string[] =>
terminalBundleId === null
? [...allowed]
: allowed.filter(id => id !== terminalBundleId)
// NOTE(review): the `β` in the first message below looks like an
// encoding-damaged arrow — confirm against the repository copy before
// touching the string.
logForDebugging(
terminalBundleId
? `[computer-use] terminal ${terminalBundleId} β surrogate host (hide-exempt, activate-skip, screenshot-excluded)`
: '[computer-use] terminal not detected; falling back to sentinel host',
)
return {
// Advertised host stays the sentinel even when a terminal was detected;
// only the Swift calls below receive `surrogateHost`.
capabilities: {
...CLI_CU_CAPABILITIES,
hostBundleId: CLI_HOST_BUNDLE_ID,
},
// ── Pre-action sequence (hide + defocus) ────────────────────────────
/**
* Run the Swift `prepareDisplay` step before an action.
*
* @returns `result.hidden` from `prepareDisplay`; an empty array when the
*   hide-before-action gate is off or when `prepareDisplay` throws (the
*   failure is logged at warn level and not rethrown).
*/
async prepareForAction(
allowlistBundleIds: string[],
displayId?: number,
): Promise<string[]> {
if (!getHideBeforeActionEnabled()) {
return []
}
// prepareDisplay isn't @MainActor (plain Task{}), but its .hide() calls
// trigger window-manager events that queue on CFRunLoop. Without the
// pump, those pile up during Swift's ~1s of usleeps and flush all at
// once when the next pumped call runs — visible window flashing.
// Electron drains CFRunLoop continuously so Cowork doesn't see this.
// Worst-case 100ms + 5×200ms safety-net ≈ 1.1s, well under the 30s
// drainRunLoop ceiling.
//
// "Continue with action execution even if switching fails" — the
// frontmost gate in toolCalls.ts catches any actual unsafe state.
return drainRunLoop(async () => {
try {
const result = await cu.apps.prepareDisplay(
allowlistBundleIds,
surrogateHost,
displayId,
)
if (result.activated) {
logForDebugging(
`[computer-use] prepareForAction: activated ${result.activated}`,
)
}
return result.hidden
} catch (err) {
logForDebugging(
`[computer-use] prepareForAction failed; continuing to action: ${errorMessage(err)}`,
{ level: 'warn' },
)
return []
}
})
},
/**
* Forward to the Swift `previewHideSet`, with `surrogateHost` appended to
* the allowlist that is passed through.
*/
async previewHideSet(
allowlistBundleIds: string[],
displayId?: number,
): Promise<Array<{ bundleId: string; displayName: string }>> {
return cu.apps.previewHideSet(
[...allowlistBundleIds, surrogateHost],
displayId,
)
},
// ── Display ──────────────────────────────────────────────────────────
/** Forward to the Swift `display.getSize`. */
async getDisplaySize(displayId?: number): Promise<DisplayGeometry> {
return cu.display.getSize(displayId)
},
/** Forward to the Swift `display.listAll`. */
async listDisplays(): Promise<DisplayGeometry[]> {
return cu.display.listAll()
},
/** Forward to the Swift `apps.findWindowDisplays`. */
async findWindowDisplays(
bundleIds: string[],
): Promise<Array<{ bundleId: string; displayIds: number[] }>> {
return cu.apps.findWindowDisplays(bundleIds)
},
/**
* Call the Swift `resolvePrepareCapture` inside `drainRunLoop`. Target
* dimensions are computed from the size of `opts.preferredDisplayId`, the
* terminal is stripped from the allow list, and `surrogateHost` is passed
* as the host.
*/
async resolvePrepareCapture(opts: {
allowedBundleIds: string[]
preferredDisplayId?: number
autoResolve: boolean
doHide?: boolean
}): Promise<ResolvePrepareCaptureResult> {
const d = cu.display.getSize(opts.preferredDisplayId)
const [targetW, targetH] = computeTargetDims(
d.width,
d.height,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.resolvePrepareCapture(
withoutTerminal(opts.allowedBundleIds),
surrogateHost,
SCREENSHOT_JPEG_QUALITY,
targetW,
targetH,
opts.preferredDisplayId,
opts.autoResolve,
opts.doHide,
),
)
},
/**
* Pre-size to `targetImageSize` output so the API transcoder's early-return
* fires — no server-side resize, `scaleCoord` stays coherent. See
* packages/desktop/computer-use-mcp/COORDINATES.md.
*/
async screenshot(opts: {
allowedBundleIds: string[]
displayId?: number
}): Promise<ScreenshotResult> {
const d = cu.display.getSize(opts.displayId)
const [targetW, targetH] = computeTargetDims(
d.width,
d.height,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.screenshot.captureExcluding(
withoutTerminal(opts.allowedBundleIds),
SCREENSHOT_JPEG_QUALITY,
targetW,
targetH,
opts.displayId,
),
)
},
/**
* Capture a region via the Swift `captureRegion`, inside `drainRunLoop`.
* Output dimensions come from `computeTargetDims` applied to the region's
* own width/height and the display's scale factor; the region itself is
* passed through unchanged.
*/
async zoom(
regionLogical: { x: number; y: number; w: number; h: number },
allowedBundleIds: string[],
displayId?: number,
): Promise<{ base64: string; width: number; height: number }> {
const d = cu.display.getSize(displayId)
const [outW, outH] = computeTargetDims(
regionLogical.w,
regionLogical.h,
d.scaleFactor,
)
return drainRunLoop(() =>
cu.screenshot.captureRegion(
withoutTerminal(allowedBundleIds),
regionLogical.x,
regionLogical.y,
regionLogical.w,
regionLogical.h,
outW,
outH,
SCREENSHOT_JPEG_QUALITY,
displayId,
),
)
},
// ── Keyboard ─────────────────────────────────────────────────────────
/**
* xdotool-style sequence e.g. "ctrl+shift+a" — split on '+' and pass to
* keys(). keys() dispatches to DispatchQueue.main — drainRunLoop pumps
* CFRunLoop so it resolves. Rust's error-path cleanup (enigo_wrap.rs)
* releases modifiers on each invocation, so a mid-loop throw leaves
* nothing stuck. 8ms between iterations — 125Hz USB polling cadence.
*
* Empty segments produced by the split are dropped. `repeat` defaults to 1.
*/
async key(keySequence: string, repeat?: number): Promise<void> {
const input = requireComputerUseInput()
const parts = keySequence.split('+').filter(p => p.length > 0)
// Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
// etc. pass through without aborting.
const isEsc = isBareEscape(parts)
const n = repeat ?? 1
await drainRunLoop(async () => {
for (let i = 0; i < n; i++) {
if (i > 0) {
await sleep(8)
}
// Notified once per repetition, immediately before that repetition's
// keys() call.
if (isEsc) {
notifyExpectedEscape()
}
await input.keys(parts)
}
})
},
/**
* Press every key in `keyNames` in order, wait `durationMs`, then release
* whatever was actually pressed (newest first, via `releasePressed`).
*/
async holdKey(keyNames: string[], durationMs: number): Promise<void> {
const input = requireComputerUseInput()
// Press/release each wrapped in drainRunLoop; the sleep sits outside so
// durationMs isn't bounded by drainRunLoop's 30s timeout. `pressed`
// tracks which presses landed so a mid-press throw still releases
// everything that was actually pressed.
//
// `orphaned` guards against a timeout-orphan race: if the press-phase
// drainRunLoop times out while the esc-hotkey pump-retain keeps the
// pump running, the orphaned lambda would continue pushing to `pressed`
// after the finally block has begun releasing — leaving keys stuck.
// The flag stops the lambda at the next iteration.
const pressed: string[] = []
let orphaned = false
try {
await drainRunLoop(async () => {
for (const k of keyNames) {
if (orphaned) return
// Bare Escape: notify the CGEventTap so it doesn't fire the
// abort callback for a model-synthesized press. Same as key().
if (isBareEscape([k])) {
notifyExpectedEscape()
}
await input.key(k, 'press')
pressed.push(k)
}
})
await sleep(durationMs)
} finally {
// Set before releasing so a still-running press lambda stops at its next
// loop check.
orphaned = true
await drainRunLoop(() => releasePressed(input, pressed))
}
},
/**
* Type `text`. With `opts.viaClipboard` the text is pasted through
* `typeViaClipboard`; otherwise it is handed to `input.typeText` directly.
*/
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
const input = requireComputerUseInput()
if (opts.viaClipboard) {
// keys(['command','v']) inside needs the pump.
await drainRunLoop(() => typeViaClipboard(input, text))
return
}
// `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
// once per grapheme. typeText doesn't dispatch to the main queue.
await input.typeText(text)
},
// Clipboard access is exposed as the pbpaste/pbcopy helpers themselves.
readClipboard: readClipboardViaPbpaste,
writeClipboard: writeClipboardViaPbcopy,
// ── Mouse ────────────────────────────────────────────────────────────
/** Instant move to (x, y) followed by the settle pause (`moveAndSettle`). */
async moveMouse(x: number, y: number): Promise<void> {
await moveAndSettle(requireComputerUseInput(), x, y)
},
/**
* Move, then click. Modifiers are press/release bracketed via withModifiers
* — same pattern as Cowork. AppKit computes NSEvent.clickCount from timing
* + position proximity, so double/triple click work without setting the
* CGEvent clickState field. key() inside withModifiers needs the pump;
* the modifier-less path doesn't.
*/
async click(
x: number,
y: number,
button: 'left' | 'right' | 'middle',
count: 1 | 2 | 3,
modifiers?: string[],
): Promise<void> {
const input = requireComputerUseInput()
await moveAndSettle(input, x, y)
if (modifiers && modifiers.length > 0) {
await drainRunLoop(() =>
withModifiers(input, modifiers, () =>
input.mouseButton(button, 'click', count),
),
)
} else {
await input.mouseButton(button, 'click', count)
}
},
/** Press the left button at the current cursor position (no move). */
async mouseDown(): Promise<void> {
await requireComputerUseInput().mouseButton('left', 'press')
},
/** Release the left button at the current cursor position (no move). */
async mouseUp(): Promise<void> {
await requireComputerUseInput().mouseButton('left', 'release')
},
/** Forward to the input module's `mouseLocation()`. */
async getCursorPosition(): Promise<{ x: number; y: number }> {
return requireComputerUseInput().mouseLocation()
},
/**
* `from === undefined` → drag from current cursor (training's
* left_click_drag with start_coordinate omitted). Inner `finally`: the
* button is ALWAYS released even if the move throws — otherwise the
* user's left button is stuck-pressed until they physically click.
* 50ms sleep after press: enigo's move_mouse reads NSEvent.pressedMouseButtons
* to decide .leftMouseDragged vs .mouseMoved; the synthetic leftMouseDown
* needs a HID-tap round-trip to show up there.
*/
async drag(
from: { x: number; y: number } | undefined,
to: { x: number; y: number },
): Promise<void> {
const input = requireComputerUseInput()
if (from !== undefined) {
await moveAndSettle(input, from.x, from.y)
}
await input.mouseButton('left', 'press')
await sleep(MOVE_SETTLE_MS)
try {
// The animation gate is read here, at drag time, not at factory time.
await animatedMove(input, to.x, to.y, getMouseAnimationEnabled())
} finally {
await input.mouseButton('left', 'release')
}
},
/**
* Move first, then scroll each axis. Vertical-first — it's the common
* axis; a horizontal failure shouldn't lose the vertical. An axis whose
* delta is 0 is skipped.
*/
async scroll(x: number, y: number, dx: number, dy: number): Promise<void> {
const input = requireComputerUseInput()
await moveAndSettle(input, x, y)
if (dy !== 0) {
await input.mouseScroll(dy, 'vertical')
}
if (dx !== 0) {
await input.mouseScroll(dx, 'horizontal')
}
},
// ── App management ───────────────────────────────────────────────────
/**
* Frontmost app as reported by the input module, with `appName` exposed as
* `displayName`. Returns null when no info is returned or its `bundleId`
* is falsy.
*/
async getFrontmostApp(): Promise<FrontmostApp | null> {
const info = requireComputerUseInput().getFrontmostAppInfo()
if (!info || !info.bundleId) return null
return { bundleId: info.bundleId, displayName: info.appName }
},
/** Forward to the Swift `apps.appUnderPoint`. */
async appUnderPoint(
x: number,
y: number,
): Promise<{ bundleId: string; displayName: string } | null> {
return cu.apps.appUnderPoint(x, y)
},
/** Forward to the Swift `apps.listInstalled`, inside `drainRunLoop`. */
async listInstalledApps(): Promise<InstalledApp[]> {
// `ComputerUseInstalledApp` is `{bundleId, displayName, path}`.
// `InstalledApp` adds optional `iconDataUrl` — left unpopulated;
// the approval dialog fetches lazily via getAppIcon() below.
return drainRunLoop(() => cu.apps.listInstalled())
},
/**
* Forward to the Swift `apps.iconDataUrl`; a null/undefined result is
* normalised to `undefined`.
*/
async getAppIcon(path: string): Promise<string | undefined> {
return cu.apps.iconDataUrl(path) ?? undefined
},
/** Forward to the Swift `apps.listRunning`. */
async listRunningApps(): Promise<RunningApp[]> {
return cu.apps.listRunning()
},
/** Forward to the Swift `apps.open`; its result is discarded. */
async openApp(bundleId: string): Promise<void> {
await cu.apps.open(bundleId)
},
}
}
/**
 * Unhide the given apps through the Swift module.
 *
 * Module-level export (not on the executor object) — called at turn-end from
 * `stopHooks.ts` / `query.ts`, outside the executor lifecycle. Fire-and-forget
 * at the call site; the caller `.catch()`es.
 *
 * @param bundleIds Bundle IDs to unhide. An empty list returns immediately,
 *   without loading the Swift module. A copy of the list is what gets passed
 *   on.
 */
export async function unhideComputerUseApps(
  bundleIds: readonly string[],
): Promise<void> {
  if (bundleIds.length > 0) {
    const swift = requireComputerUseSwift()
    await swift.apps.unhide(Array.from(bundleIds))
  }
}