This commit is contained in:
Ren Amamiya
2023-08-12 11:18:10 +07:00
parent 36b2acba6a
commit bb089bb259
27 changed files with 502 additions and 481 deletions

View File

@@ -19,7 +19,13 @@ interface IPreFetchedResource {
imagesPropertyType?: string;
proxyUrl?: string;
url: string;
data: any;
data: string;
}
/**
 * SSRF guard: rejects an address that matches the loopback pattern.
 *
 * @param address - string tested against `OPENGRAPH.REGEX_LOOPBACK`
 * @throws Error when the address matches the loopback pattern
 */
function throwOnLoopback(address: string) {
  const pointsAtLoopback = OPENGRAPH.REGEX_LOOPBACK.test(address);
  if (!pointsAtLoopback) {
    return;
  }
  throw new Error('SSRF request detected, trying to query host');
}
function metaTag(doc: cheerio.CheerioAPI, type: string, attr: string) {
@@ -28,42 +34,42 @@ function metaTag(doc: cheerio.CheerioAPI, type: string, attr: string) {
}
/**
 * Looks up `<meta>` elements whose `attr` attribute equals `type` and returns
 * the value of their `content` attribute as reported by cheerio's `.attr()`.
 *
 * @param doc - loaded cheerio document used to run the selector
 * @param type - attribute value to match, e.g. `og:title`
 * @param attr - attribute name to match on, e.g. `property` or `name`
 * @returns the `content` value, or `undefined` when nothing matches
 */
function metaTagContent(doc: cheerio.CheerioAPI, type: string, attr: string) {
  return doc(`meta[${attr}='${type}']`).attr(`content`);
}
/**
 * Resolves the page title.
 *
 * Tries the `og:title` meta tag (matched by `property`, then by `name`) and
 * falls back to the text of the `<title>` element when neither yields a
 * non-empty value.
 *
 * @param doc - loaded cheerio document
 * @returns the title text (may be an empty string when nothing is found)
 */
function getTitle(doc: cheerio.CheerioAPI) {
  let title =
    metaTagContent(doc, `og:title`, `property`) ||
    metaTagContent(doc, `og:title`, `name`);

  if (!title) {
    title = doc(`title`).text();
  }

  return title;
}
/**
 * Resolves the site name from the `og:site_name` meta tag, matched by
 * `property` first and by `name` second.
 *
 * @param doc - loaded cheerio document
 * @returns the site name, or `undefined` when no such meta tag has content
 */
function getSiteName(doc: cheerio.CheerioAPI) {
  const siteName =
    metaTagContent(doc, `og:site_name`, `property`) ||
    metaTagContent(doc, `og:site_name`, `name`);

  return siteName;
}
/**
 * Resolves the page description.
 *
 * Lookup order: `<meta name="description">`, `<meta name="Description">`
 * (capitalised variant), then `<meta property="og:description">`.
 *
 * @param doc - loaded cheerio document
 * @returns the first non-empty description, or `undefined` when none is found
 */
function getDescription(doc: cheerio.CheerioAPI) {
  const description =
    metaTagContent(doc, `description`, `name`) ||
    metaTagContent(doc, `Description`, `name`) ||
    metaTagContent(doc, `og:description`, `property`);

  return description;
}
/**
 * Resolves the media type of the page.
 *
 * A `<meta name="medium">` tag takes precedence; its `image` value is
 * reported as `photo`, any other value is returned as-is. Without that tag
 * the `og:type` meta tag is used (matched by `property`, then by `name`).
 *
 * @param doc - loaded cheerio document
 * @returns the media type string, or `undefined` when no tag provides one
 */
function getMediaType(doc: cheerio.CheerioAPI) {
  const node = metaTag(doc, `medium`, `name`);

  if (node) {
    const content = node.attr(`content`);
    return content === `image` ? `photo` : content;
  }

  return (
    metaTagContent(doc, `og:type`, `property`) || metaTagContent(doc, `og:type`, `name`)
  );
}
@@ -77,14 +83,14 @@ function getImages(
let src: string | undefined;
let dic: Record<string, boolean> = {};
const imagePropertyType = imagesPropertyType ?? 'og';
const imagePropertyType = imagesPropertyType ?? `og`;
nodes =
metaTag(doc, `${imagePropertyType}:image`, 'property') ||
metaTag(doc, `${imagePropertyType}:image`, 'name');
metaTag(doc, `${imagePropertyType}:image`, `property`) ||
metaTag(doc, `${imagePropertyType}:image`, `name`);
if (nodes) {
nodes.each((_: number, node: cheerio.Element) => {
if (node.type === 'tag') {
if (node.type === `tag`) {
src = node.attribs.content;
if (src) {
src = new URL(src, rootUrl).href;
@@ -95,18 +101,18 @@ function getImages(
}
if (images.length <= 0 && !imagesPropertyType) {
src = doc('link[rel=image_src]').attr('href');
src = doc(`link[rel=image_src]`).attr(`href`);
if (src) {
src = new URL(src, rootUrl).href;
images = [src];
} else {
nodes = doc('img');
nodes = doc(`img`);
if (nodes?.length) {
dic = {};
images = [];
nodes.each((_: number, node: cheerio.Element) => {
if (node.type === 'tag') src = node.attribs.src;
if (node.type === `tag`) src = node.attribs.src;
if (src && !dic[src]) {
dic[src] = true;
// width = node.attribs.width;
@@ -135,32 +141,32 @@ function getVideos(doc: cheerio.CheerioAPI) {
let videoObj;
let index;
const nodes = metaTag(doc, 'og:video', 'property') || metaTag(doc, 'og:video', 'name');
const nodes = metaTag(doc, `og:video`, `property`) || metaTag(doc, `og:video`, `name`);
if (nodes?.length) {
nodeTypes =
metaTag(doc, 'og:video:type', 'property') || metaTag(doc, 'og:video:type', 'name');
metaTag(doc, `og:video:type`, `property`) || metaTag(doc, `og:video:type`, `name`);
nodeSecureUrls =
metaTag(doc, 'og:video:secure_url', 'property') ||
metaTag(doc, 'og:video:secure_url', 'name');
metaTag(doc, `og:video:secure_url`, `property`) ||
metaTag(doc, `og:video:secure_url`, `name`);
width =
metaTagContent(doc, 'og:video:width', 'property') ||
metaTagContent(doc, 'og:video:width', 'name');
metaTagContent(doc, `og:video:width`, `property`) ||
metaTagContent(doc, `og:video:width`, `name`);
height =
metaTagContent(doc, 'og:video:height', 'property') ||
metaTagContent(doc, 'og:video:height', 'name');
metaTagContent(doc, `og:video:height`, `property`) ||
metaTagContent(doc, `og:video:height`, `name`);
for (index = 0; index < nodes.length; index += 1) {
const node = nodes[index];
if (node.type === 'tag') video = node.attribs.content;
if (node.type === `tag`) video = node.attribs.content;
nodeType = nodeTypes?.[index];
if (nodeType?.type === 'tag') {
if (nodeType?.type === `tag`) {
videoType = nodeType ? nodeType.attribs.content : null;
}
nodeSecureUrl = nodeSecureUrls?.[index];
if (nodeSecureUrl?.type === 'tag') {
if (nodeSecureUrl?.type === `tag`) {
videoSecureUrl = nodeSecureUrl ? nodeSecureUrl.attribs.content : null;
}
@@ -171,7 +177,7 @@ function getVideos(doc: cheerio.CheerioAPI) {
width,
height,
};
if (videoType && videoType.indexOf('video/') === 0) {
if (videoType && videoType.indexOf(`video/`) === 0) {
videos.splice(0, 0, videoObj);
} else {
videos.push(videoObj);
@@ -193,7 +199,7 @@ function getFavicons(doc: cheerio.CheerioAPI, rootUrl: string) {
let nodes: cheerio.Cheerio<cheerio.Element> | never[] = [];
let src: string | undefined;
const relSelectors = ['rel=icon', `rel="shortcut icon"`, 'rel=apple-touch-icon'];
const relSelectors = [`rel=icon`, `rel="shortcut icon"`, `rel=apple-touch-icon`];
relSelectors.forEach((relSelector) => {
// look for all icon tags
@@ -202,9 +208,9 @@ function getFavicons(doc: cheerio.CheerioAPI, rootUrl: string) {
// collect all images from icon tags
if (nodes.length) {
nodes.each((_: number, node: cheerio.Element) => {
if (node.type === 'tag') src = node.attribs.href;
if (node.type === `tag`) src = node.attribs.href;
if (src) {
src = new URL(rootUrl).href;
src = new URL(src, rootUrl).href;
images.push(src);
}
});
@@ -222,7 +228,7 @@ function getFavicons(doc: cheerio.CheerioAPI, rootUrl: string) {
function parseImageResponse(url: string, contentType: string) {
return {
url,
mediaType: 'image',
mediaType: `image`,
contentType,
favicons: [getDefaultFavicon(url)],
};
@@ -231,7 +237,7 @@ function parseImageResponse(url: string, contentType: string) {
function parseAudioResponse(url: string, contentType: string) {
return {
url,
mediaType: 'audio',
mediaType: `audio`,
contentType,
favicons: [getDefaultFavicon(url)],
};
@@ -240,7 +246,7 @@ function parseAudioResponse(url: string, contentType: string) {
function parseVideoResponse(url: string, contentType: string) {
return {
url,
mediaType: 'video',
mediaType: `video`,
contentType,
favicons: [getDefaultFavicon(url)],
};
@@ -249,7 +255,7 @@ function parseVideoResponse(url: string, contentType: string) {
function parseApplicationResponse(url: string, contentType: string) {
return {
url,
mediaType: 'application',
mediaType: `application`,
contentType,
favicons: [getDefaultFavicon(url)],
};
@@ -268,7 +274,7 @@ function parseTextResponse(
title: getTitle(doc),
siteName: getSiteName(doc),
description: getDescription(doc),
mediaType: getMediaType(doc) || 'website',
mediaType: getMediaType(doc) || `website`,
contentType,
images: getImages(doc, url, options.imagesPropertyType),
videos: getVideos(doc),
@@ -287,11 +293,11 @@ function parseUnknownResponse(
function parseResponse(response: IPreFetchedResource, options?: ILinkPreviewOptions) {
try {
let contentType = response.headers['content-type'];
let contentType = response.headers[`content-type`];
// console.warn(`original content type`, contentType);
if (contentType?.indexOf(';')) {
if (contentType?.indexOf(`;`)) {
// eslint-disable-next-line prefer-destructuring
contentType = contentType.split(';')[0];
contentType = contentType.split(`;`)[0];
// console.warn(`splitting content type`, contentType);
}
@@ -330,19 +336,117 @@ function parseResponse(response: IPreFetchedResource, options?: ILinkPreviewOpti
}
}
/**
 * Parses the text, extracts the first link it finds and does a HTTP request
 * to fetch the website content, afterwards it tries to parse the internal HTML
 * and extract the information via meta tags.
 *
 * Options read here:
 * - `followRedirects`: passed to fetch as `redirect` (defaults to `error`);
 *   when `manual`, `handleRedirects` is required and decides whether a
 *   301-308 redirect is followed.
 * - `resolveDNSHost`: when provided, its result for the detected URL (and for
 *   a manually followed redirect target) is checked by `throwOnLoopback`.
 * - `timeout`: milliseconds before the request is aborted (default 3000).
 * - `headers`: request headers.
 * - `proxyUrl`: prefix prepended to the detected URL and stripped again from
 *   the final response URL.
 *
 * @param text string, text to be parsed
 * @param options ILinkPreviewOptions
 * @returns the preview produced by `parseResponse`
 * @throws Error when no URL is found, when the options are inconsistent, when
 * the redirect is refused, or with message `Request timeout` when a fetch is
 * aborted by the timeout
 */
export async function getLinkPreview(text: string, options?: ILinkPreviewOptions) {
  if (!text || typeof text !== `string`) {
    throw new Error(`link-preview-js did not receive a valid url or text`);
  }

  // First whitespace-separated token that looks like a URL.
  const detectedUrl = text
    .replace(/\n/g, ` `)
    .split(` `)
    .find((token) => OPENGRAPH.REGEX_VALID_URL.test(token));

  if (!detectedUrl) {
    throw new Error(`link-preview-js did not receive a valid a url or text`);
  }

  if (options?.followRedirects === `manual` && !options?.handleRedirects) {
    throw new Error(
      `link-preview-js followRedirects is set to manual, but no handleRedirects function was provided`
    );
  }

  if (options?.resolveDNSHost) {
    const resolvedUrl = await options.resolveDNSHost(detectedUrl);
    throwOnLoopback(resolvedUrl);
  }

  const timeout = options?.timeout ?? 3000; // 3 second timeout default
  const controller = new AbortController();
  const timeoutCounter = setTimeout(() => controller.abort(), timeout);

  const fetchOptions = {
    headers: options?.headers ?? {},
    redirect: options?.followRedirects ?? `error`,
    signal: controller.signal,
  };

  const fetchUrl = options?.proxyUrl ? options.proxyUrl.concat(detectedUrl) : detectedUrl;

  // Single place that issues a request and turns an abort into the
  // library's timeout error, so both the initial and the redirect request
  // report a timeout the same way.
  const fetchOrTimeout = async (url: string) => {
    try {
      // Seems like fetchOptions type definition is out of date
      // https://github.com/node-fetch/node-fetch/issues/741
      return await fetch(url, fetchOptions as any);
    } catch (e: unknown) {
      if ((e as { name?: unknown } | null)?.name === `AbortError`) {
        throw new Error(`Request timeout`);
      }
      throw e;
    }
  };

  let response: Awaited<ReturnType<typeof fetchOrTimeout>>;
  try {
    response = await fetchOrTimeout(fetchUrl);

    if (
      response.status > 300 &&
      response.status < 309 &&
      fetchOptions.redirect === `manual` &&
      options?.handleRedirects
    ) {
      const forwardedUrl = response.headers.get(`location`) || ``;

      if (!options.handleRedirects(fetchUrl, forwardedUrl)) {
        throw new Error(`link-preview-js could not handle redirect`);
      }

      if (options?.resolveDNSHost) {
        const resolvedUrl = await options.resolveDNSHost(forwardedUrl);
        throwOnLoopback(resolvedUrl);
      }

      response = await fetchOrTimeout(forwardedUrl);
    }
  } finally {
    // Always release the timer, including when the redirect handling or the
    // second request throws, so no pending timeout outlives this call.
    clearTimeout(timeoutCounter);
  }

  const headers: Record<string, string> = {};
  response.headers.forEach((header, key) => {
    headers[key] = header;
  });

  const normalizedResponse: IPreFetchedResource = {
    url: options?.proxyUrl ? response.url.replace(options.proxyUrl, ``) : response.url,
    headers,
    data: await response.text(),
  };

  return parseResponse(normalizedResponse, options);
}
/**
 * Builds a link preview from a response the caller fetched on their own,
 * skipping the library's HTTP request and running only the internal parsing.
 *
 * @param response pre-fetched response; must be an object with a truthy `url`
 * @param options ILinkPreviewOptions forwarded to the parser
 * @throws Error when `response` is not an object or has no `url`
 */
export async function getPreviewFromContent(
  response: IPreFetchedResource,
  options?: ILinkPreviewOptions
) {
  const isObject = Boolean(response) && typeof response === `object`;

  if (!isObject || !response.url) {
    throw new Error(`link-preview-js did not receive a valid response object`);
  }

  return parseResponse(response, options);
}