📄 File detail
tools/WebFetchTool/utils.ts
🎯 Use case
This module implements the "WebFetchTool" tool (Web Fetch) — something the model can call at runtime alongside other agent tools. On the API surface it exposes clearWebFetchCache, MAX_MARKDOWN_LENGTH, isPreapprovedUrl, validateURL, and checkDomainBlocklist (and more) — mainly functions, hooks, or classes. Dependencies touch HTTP client and lru-cache. It composes internal code from services, preapproved, and prompt (relative imports).
Generated from folder role, exports, dependency roots, and inline comments β not hand-reviewed for every path.
🧠 Inline summary
import axios, { type AxiosResponse } from 'axios' import { LRUCache } from 'lru-cache' import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent,
📤 Exports (heuristic)
clearWebFetchCache, MAX_MARKDOWN_LENGTH, isPreapprovedUrl, validateURL, checkDomainBlocklist, isPermittedRedirect, getWithPermittedRedirects, FetchedContent, getURLMarkdownContent, applyPromptToMarkdown
📦 External import roots
Package roots taken from `from "…"` specifiers (relative paths omitted).
axios, lru-cache
🖥️ Source preview
import axios, { type AxiosResponse } from 'axios'
import { LRUCache } from 'lru-cache'
import {
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
logEvent,
} from '../../services/analytics/index.js'
import { queryHaiku } from '../../services/api/claude.js'
import { AbortError } from '../../utils/errors.js'
import { getWebFetchUserAgent } from '../../utils/http.js'
import { logError } from '../../utils/log.js'
import {
isBinaryContentType,
persistBinaryContent,
} from '../../utils/mcpOutputStorage.js'
import { getSettings_DEPRECATED } from '../../utils/settings/settings.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { isPreapprovedHost } from './preapproved.js'
import { makeSecondaryModelPrompt } from './prompt.js'
// Custom error classes for domain blocking
/** Thrown when the blocklist preflight reports the domain may not be fetched. */
class DomainBlockedError extends Error {
  // Explicit name so logs and error renderers show the class, not "Error".
  name = 'DomainBlockedError'
  constructor(domain: string) {
    super(`Claude Code is unable to fetch from ${domain}`)
  }
}
/**
 * Thrown when the blocklist preflight could not be completed at all
 * (network restriction, enterprise proxy, etc.) — distinct from an
 * explicit "blocked" verdict.
 */
class DomainCheckFailedError extends Error {
  // Explicit name so logs and error renderers show the class, not "Error".
  name = 'DomainCheckFailedError'
  constructor(domain: string) {
    super(
      `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`,
    )
  }
}
/**
 * Thrown when the network egress proxy refuses the request (403 +
 * X-Proxy-Error: blocked-by-allowlist). The message is a JSON payload so
 * structured consumers can parse error_type/domain out of it.
 */
class EgressBlockedError extends Error {
  // Explicit field assignment instead of a parameter property: TS requires
  // super() to be the first statement when parameter properties are present,
  // and this keeps the construction order obvious.
  public readonly domain: string
  constructor(domain: string) {
    super(
      JSON.stringify({
        error_type: 'EGRESS_BLOCKED',
        domain,
        message: `Access to ${domain} is blocked by the network egress proxy.`,
      }),
    )
    this.domain = domain
    this.name = 'EgressBlockedError'
  }
}
// Cache for storing fetched URL content
type CacheEntry = {
  // Raw response body size in bytes (before markdown conversion).
  bytes: number
  // HTTP status code of the final response.
  code: number
  // HTTP status text accompanying `code`.
  codeText: string
  // Markdown (for text/html) or the raw utf-8 decoded body.
  content: string
  // Content-Type response header ('' when the header was absent).
  contentType: string
  // Filesystem path of the on-disk copy for binary content types, if saved.
  persistedPath?: string
  // Byte size of that persisted file, if saved.
  persistedSize?: number
}
// Cache with 15-minute TTL and 50MB size limit
// LRUCache handles automatic expiration and eviction
const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes
const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB
// Per-entry sizes are supplied at set() time (see getURLMarkdownContent), so
// maxSize bounds the total content bytes retained in memory.
const URL_CACHE = new LRUCache<string, CacheEntry>({
  maxSize: MAX_CACHE_SIZE_BYTES,
  ttl: CACHE_TTL_MS,
})
// Separate cache for preflight domain checks. URL_CACHE is URL-keyed, so
// fetching two paths on the same domain triggers two identical preflight
// HTTP round-trips to api.anthropic.com. This hostname-keyed cache avoids
// that. Only 'allowed' is cached — blocked/failed re-check on next attempt.
const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
  max: 128,
  ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL
})
/** Empty both module-level caches: fetched content and domain-preflight approvals. */
export function clearWebFetchCache(): void {
  for (const cache of [URL_CACHE, DOMAIN_CHECK_CACHE] as const) {
    cache.clear()
  }
}
// Lazily constructed singleton for the turndown HTML→markdown converter.
// Deferring the import keeps ~1.4MB of retained heap (turndown plus its
// @mixmark-io/domino dependency) out of memory until the first HTML fetch,
// and one instance is shared across calls: construction builds 15 rule
// objects, while .turndown() itself is stateless.
// @types/turndown ships only `export =` (no .d.mts), so TS types the dynamic
// import as the class itself while Bun wraps CJS in { default } — hence the cast.
type TurndownCtor = typeof import('turndown')
let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined
function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
  if (turndownServicePromise === undefined) {
    turndownServicePromise = import('turndown').then(mod => {
      const TurndownClass = (mod as unknown as { default: TurndownCtor }).default
      return new TurndownClass()
    })
  }
  return turndownServicePromise
}
// PSR asked for a 250-character URL cap to lower data-exfiltration potential,
// but that broke legitimate long URLs (e.g. JWT-signed cloud-storage links).
// Per-domain user approval remains the primary security boundary and Claude
// Code has other exfil channels anyway, so the cap is a generous 2000. -ab
const MAX_URL_LENGTH = 2000
// Per PSR:
// "Implement resource consumption controls because setting limits on CPU,
// memory, and network usage for the Web Fetch tool can prevent a single
// request or user from overwhelming the system."
const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024
// Main HTTP fetch timeout (60s) — prevents hanging on slow/unresponsive servers.
const FETCH_TIMEOUT_MS = 60_000
// Timeout for the domain blocklist preflight check (10s).
const DOMAIN_CHECK_TIMEOUT_MS = 10_000
// Cap redirect hops: without it a malicious server could loop (/a → /b → /a …)
// and the per-request FETCH_TIMEOUT_MS resets on every hop, hanging the tool
// until user interrupt. 10 sits among common client defaults (axios=5,
// follow-redirects=21, Chrome=20).
const MAX_REDIRECTS = 10
// Truncation limit for markdown handed to the model, to bound token spend.
export const MAX_MARKDOWN_LENGTH = 100_000

/** True when the URL parses and its host/path are on the preapproved list. */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const { hostname, pathname } = new URL(url)
    return isPreapprovedHost(hostname, pathname)
  } catch {
    return false
  }
}

/**
 * Screen a candidate URL before fetching. Rejects URLs that are over-long,
 * unparseable, carry embedded credentials, or whose hostname contains no dot
 * (a first-pass filter against single-label, company-internal hostnames).
 * The protocol is deliberately not checked: http is upgraded to https at
 * request time.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }
  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return false
  }
  // Since we aren't supporting cookies or internal domains, URLs embedding a
  // username/password are rejected outright, however unlikely they are.
  if (parsed.username !== '' || parsed.password !== '') {
    return false
  }
  // Require at least one dot in the hostname as an initial filter that this
  // isn't a privileged, company-internal name.
  return parsed.hostname.includes('.')
}
type DomainCheckResult =
  | { status: 'allowed' }
  | { status: 'blocked' }
  | { status: 'check_failed'; error: Error }

/**
 * Preflight a hostname against Anthropic's domain blocklist service.
 *
 * Only positive ('allowed') verdicts are cached (in DOMAIN_CHECK_CACHE);
 * blocked and failed checks are re-attempted on the next fetch.
 *
 * @param domain hostname to check (query-encoded into the request URL)
 * @returns 'allowed' | 'blocked' | 'check_failed' — never throws; transport
 *          failures are folded into the 'check_failed' variant.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }
  try {
    const response = await axios.get(
      `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`,
      { timeout: DOMAIN_CHECK_TIMEOUT_MS },
    )
    if (response.status === 200) {
      if (response.data.can_fetch === true) {
        DOMAIN_CHECK_CACHE.set(domain, true)
        return { status: 'allowed' }
      }
      return { status: 'blocked' }
    }
    // Non-200 status but didn't throw
    return {
      status: 'check_failed',
      error: new Error(`Domain check returned status ${response.status}`),
    }
  } catch (e) {
    logError(e)
    // Normalize the unknown catch value instead of an unchecked `as Error`
    // cast: a non-Error throw would otherwise give callers an object with
    // no .message/.stack despite the declared type.
    const error = e instanceof Error ? e : new Error(String(e))
    return { status: 'check_failed', error }
  }
}
/**
 * Decide whether a redirect target is safe to follow automatically.
 * Permitted: same-origin changes to path/query, and hostname changes that
 * only add or remove a leading "www." (possibly combined with a path change).
 * Protocol and port must match exactly, and the target must not carry
 * embedded credentials. Unparseable URLs are refused.
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  let from: URL
  let to: URL
  try {
    from = new URL(originalUrl)
    to = new URL(redirectUrl)
  } catch {
    return false
  }
  if (to.protocol !== from.protocol || to.port !== from.port) {
    return false
  }
  if (to.username !== '' || to.password !== '') {
    return false
  }
  // Compare hostnames modulo a leading "www." so that example.com and
  // www.example.com are treated as the same site in either direction.
  const canonical = (host: string): string => host.replace(/^www\./, '')
  return canonical(from.hostname) === canonical(to.hostname)
}
/**
 * Helper function to handle fetching URLs with custom redirect handling.
 * Recursively follows a redirect only when it passes the redirectChecker
 * function; otherwise returns a RedirectInfo so the caller can decide.
 *
 * Per PSR:
 * "Do not automatically follow redirects because following redirects could
 * allow for an attacker to exploit an open redirect vulnerability in a
 * trusted domain to force a user to make a request to a malicious domain
 * unknowingly"
 *
 * Throws: Error when MAX_REDIRECTS is exceeded or a redirect has no Location
 * header; EgressBlockedError on proxy allowlist blocks; otherwise rethrows
 * the underlying axios error.
 */
type RedirectInfo = {
  type: 'redirect' // discriminant consumed by isRedirectInfo
  originalUrl: string // the URL whose response was the redirect
  redirectUrl: string // absolute target, resolved against originalUrl
  statusCode: number // 301 | 302 | 307 | 308
}
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0, // recursion depth; external callers omit it
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  // Cap redirect hops. Without this a malicious server can return a redirect
  // loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS resets on
  // every hop, hanging the tool until user interrupt. 10 matches common
  // client defaults (axios=5, follow-redirects=21, Chrome=20).
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }
  try {
    return await axios.get(url, {
      signal,
      timeout: FETCH_TIMEOUT_MS,
      maxRedirects: 0, // disable axios auto-follow; 3xx then rejects and is unpacked below
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // With maxRedirects: 0, a 3xx response surfaces as an axios error.
    // NOTE(review): 303 (See Other) is not in this list and would be
    // rethrown as a plain error — confirm that is intended.
    if (
      axios.isAxiosError(error) &&
      error.response &&
      [301, 302, 307, 308].includes(error.response.status)
    ) {
      const redirectLocation = error.response.headers.location
      if (!redirectLocation) {
        throw new Error('Redirect missing Location header')
      }
      // Resolve relative URLs against the original URL
      const redirectUrl = new URL(redirectLocation, url).toString()
      if (redirectChecker(url, redirectUrl)) {
        // Recursively follow the permitted redirect
        return getWithPermittedRedirects(
          redirectUrl,
          signal,
          redirectChecker,
          depth + 1,
        )
      } else {
        // Return redirect information to the caller
        return {
          type: 'redirect',
          originalUrl: url,
          redirectUrl,
          statusCode: error.response.status,
        }
      }
    }
    // Detect egress proxy blocks: the proxy returns 403 with
    // X-Proxy-Error: blocked-by-allowlist when egress is restricted
    if (
      axios.isAxiosError(error) &&
      error.response?.status === 403 &&
      error.response.headers['x-proxy-error'] === 'blocked-by-allowlist'
    ) {
      const hostname = new URL(url).hostname
      throw new EgressBlockedError(hostname)
    }
    throw error
  }
}
// Type guard separating the RedirectInfo marker object from a real axios
// response (axios responses carry no 'type' discriminant).
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  if (!('type' in response)) {
    return false
  }
  return response.type === 'redirect'
}
/**
 * Result of a successful (non-redirect) web fetch, as returned by
 * getURLMarkdownContent. Mirrors the internal cache-entry shape.
 */
export type FetchedContent = {
  // Markdown (for text/html responses) or the raw utf-8 decoded body.
  content: string
  // Raw response body size in bytes.
  bytes: number
  // HTTP status code of the final response.
  code: number
  // HTTP status text accompanying `code`.
  codeText: string
  // Content-Type response header ('' when the header was absent).
  contentType: string
  // Filesystem path of the on-disk copy for binary content types, if saved.
  persistedPath?: string
  // Byte size of that persisted file, if saved.
  persistedSize?: number
}
/**
 * Fetch a URL and return its content as markdown (for HTML) or raw decoded
 * text, plus response metadata. Results are cached under the ORIGINAL url
 * (not the https-upgraded or redirected one). Returns a RedirectInfo instead
 * when the server redirected somewhere isPermittedRedirect refused.
 *
 * Throws: Error for invalid URLs; DomainBlockedError / DomainCheckFailedError
 * from the blocklist preflight; plus whatever the HTTP layer throws
 * (including EgressBlockedError).
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }
  // Check cache (LRUCache handles TTL automatically). A field-by-field copy
  // is returned, so callers never hold a reference to the cached entry.
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return {
      bytes: cachedEntry.bytes,
      code: cachedEntry.code,
      codeText: cachedEntry.codeText,
      content: cachedEntry.content,
      contentType: cachedEntry.contentType,
      persistedPath: cachedEntry.persistedPath,
      persistedSize: cachedEntry.persistedSize,
    }
  }
  let parsedUrl: URL
  let upgradedUrl = url
  try {
    parsedUrl = new URL(url)
    // Upgrade http to https if needed
    if (parsedUrl.protocol === 'http:') {
      parsedUrl.protocol = 'https:'
      upgradedUrl = parsedUrl.toString()
    }
    const hostname = parsedUrl.hostname
    // Check if the user has opted to skip the blocklist check
    // This is for enterprise customers with restrictive security policies
    // that prevent outbound connections to claude.ai
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const checkResult = await checkDomainBlocklist(hostname)
      switch (checkResult.status) {
        case 'allowed':
          // Continue with the fetch
          break
        case 'blocked':
          throw new DomainBlockedError(hostname)
        case 'check_failed':
          throw new DomainCheckFailedError(hostname)
      }
    }
    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    if (
      e instanceof DomainBlockedError ||
      e instanceof DomainCheckFailedError
    ) {
      // Expected user-facing failures - re-throw without logging as internal error
      throw e
    }
    // Any other failure in the preflight section (settings read, analytics)
    // is logged and the fetch proceeds best-effort.
    logError(e)
  }
  const response = await getWithPermittedRedirects(
    upgradedUrl,
    abortController.signal,
    isPermittedRedirect,
  )
  // Check if we got a redirect response
  if (isRedirectInfo(response)) {
    return response
  }
  const rawBuffer = Buffer.from(response.data)
  // Release the axios-held ArrayBuffer copy; rawBuffer owns the bytes now.
  // This lets GC reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown
  // builds its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''
  // Binary content: save raw bytes to disk with a proper extension so Claude
  // can inspect the file later. We still fall through to the utf-8 decode +
  // Haiku path below — for PDFs in particular the decoded string has enough
  // ASCII structure (/Title, text streams) that Haiku can summarize it, and
  // the saved file is a supplement rather than a replacement.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    // Timestamp + random suffix keeps concurrent fetches from colliding.
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const result = await persistBinaryContent(rawBuffer, contentType, persistId)
    // Persistence failure is non-fatal: the fields simply stay undefined.
    if (!('error' in result)) {
      persistedPath = result.filepath
      persistedSize = result.size
    }
  }
  const bytes = rawBuffer.length
  const htmlContent = rawBuffer.toString('utf-8')
  let markdownContent: string
  let contentBytes: number
  if (contentType.includes('text/html')) {
    markdownContent = (await getTurndownService()).turndown(htmlContent)
    contentBytes = Buffer.byteLength(markdownContent)
  } else {
    // It's not HTML - just use it raw. The decoded string's UTF-8 byte
    // length equals rawBuffer.length (modulo U+FFFD replacement on invalid
    // bytes — negligible for cache eviction accounting), so skip the O(n)
    // Buffer.byteLength scan.
    markdownContent = htmlContent
    contentBytes = bytes
  }
  // Store the fetched content in cache. Note that it's stored under
  // the original URL, not the upgraded or redirected URL.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  return entry
}
/**
 * Run the user's prompt against fetched markdown via the secondary model
 * (Haiku). Content is clipped to MAX_MARKDOWN_LENGTH first so the request
 * does not fail with "Prompt is too long".
 *
 * Throws AbortError when the signal fired, so the tool call is reported as
 * an is_error tool_use block upstream (red dot in the UI).
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Clip oversized content; the trailing marker tells the model the page continues.
  let truncatedContent = markdownContent
  if (markdownContent.length > MAX_MARKDOWN_LENGTH) {
    truncatedContent =
      markdownContent.slice(0, MAX_MARKDOWN_LENGTH) +
      '\n\n[Content truncated due to length...]'
  }
  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })
  // Bubble the abort up so the tool call throws and renders as an error.
  if (signal.aborted) {
    throw new AbortError()
  }
  const blocks = assistantMessage.message.content
  const first = blocks.length > 0 ? blocks[0] : undefined
  if (first !== undefined && 'text' in first) {
    return first.text
  }
  return 'No response from model'
}