string-strip-html examples

Table of Contents

Quick Take

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// basic usage: tags are removed, the cleaned string is read from ".result"
assert.equal(
  stripHtml(`Some text <b>and</b> text.`).result,
  `Some text and text.`
);

// prevents accidental string concatenation:
// a space is left in place of the <div> tags, so the words don't fuse
assert.equal(
  stripHtml(`aaa<div>bbb</div>ccc`).result,
  `aaa bbb ccc`
);

// tag pairs with content, upon request:
// tags listed in "stripTogetherWithTheirContents" are removed together
// with everything between the opening and the closing tag
assert.equal(
  stripHtml(`a <pre><code>void a;</code></pre> b`, {
    stripTogetherWithTheirContents: [
      "script", // default
      "style", // default
      "xml", // default
      "pre", // <-- custom-added
    ],
  }).result,
  `a b`
);

// detects raw, legit brackets:
// "<" and ">" which don't form a tag are left untouched
assert.equal(
  stripHtml(`a < b and c > d`).result,
  `a < b and c > d`
);

A Bypass Callback and a Do-Nothing Callback

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

/**
 * Bypass callback: forwards the proposed deletion range and the proposed
 * insertion, unchanged, to "rangesArr" - that's what gets used in the
 * result calculation.
 *
 * Besides the keys read below, the callback argument also carries
 * "tag" and "proposedReturn"; this callback does not need them.
 *
 * @param {object} details - the single object handed to the callback
 */
const cb1 = (details) => {
  const { deleteFrom, deleteTo, insert, rangesArr } = details;
  rangesArr.push(deleteFrom, deleteTo, insert);
};
// plug the bypass callback in through the "cb" option
const result1 = stripHtml("abc<hr>def", {
  cb: cb1,
}).result;
// the <hr> ends up replaced with a single space
assert.equal(result1, `abc def`);

// to prove it works, don't do anything:
// this callback receives the same keys but never pushes to "rangesArr"
const cb2 = ({
  tag,
  deleteFrom,
  deleteTo,
  insert,
  rangesArr,
  proposedReturn,
}) => {
  // nothing here 🙈
};
const result2 = stripHtml("abc<hr>def", {
  cb: cb2,
}).result;
// nothing was pushed, so the input comes back unchanged, <hr> included
assert.equal(result2, "abc<hr>def");

Extract HTML <head> contents

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a complete HTML document
const someHtml = `<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta charset="utf-8">
    <title>the title</title>
  </head>
  <body>
    the content
  </body>
</html>`;

// The task asks not to include <head...> and </head>.
// First, extract head tag-to-head tag, including contents:
// ask for the locations of "head" only, treated as one unit with its contents
const { filteredTagLocations: headLocations } = stripHtml(someHtml, {
  onlyStripTags: ["head"],
  stripTogetherWithTheirContents: ["head"],
});

// cut every reported location out of the source and glue the pieces together
const headWithHeadTags = headLocations
  .map(([from, to]) => someHtml.slice(from, to))
  .join("")
  .trim();

assert.equal(
  headWithHeadTags,
  `<head>
    <meta charset="utf-8">
    <title>the title</title>
  </head>`
);

// Then, drop the opening and the closing head tag themselves.
// The opening tag may carry attributes (<head ...>), so match everything up
// to its closing bracket; "\b" keeps look-alike names such as <header> out.
const headContents = headWithHeadTags
  .replace(/<\/?head\b[^>]*>/g, "")
  .trim();
assert.equal(
  headContents,
  `<meta charset="utf-8">
    <title>the title</title>`
);

Just deletes inline tags

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

const someHtml = `This has an <b>un</b>bold word.`;

// default behaviour:
// a space is put in place of each tag, which splits the word "unbold"
assert.equal(
  stripHtml(someHtml).result,
  `This has an un bold word.`
);

// let's tackle inline tags:
// for these, delete strictly bracket-to-bracket and insert nothing
const inlineTagNames = ["b", "strong"];

const deleteInlineTagsTightly = ({
  tag,
  deleteFrom,
  deleteTo,
  insert,
  rangesArr,
}) => {
  if (!inlineTagNames.includes(tag.name)) {
    // all other tags: go with the proposed range and insertion
    rangesArr.push(deleteFrom, deleteTo, insert);
    return;
  }
  rangesArr.push(
    tag.lastOpeningBracketAt,
    tag.lastClosingBracketAt + 1
  );
};

assert.equal(
  stripHtml(someHtml, { cb: deleteInlineTagsTightly }).result,
  `This has an unbold word.`
);
import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a link whose label contains HTML entities
const someHtml = `<a href="https://twitter.com/loretparisi">twitter:loretparisi&nbsp;&oslash;</a>`;

// with "skipHtmlDecoding" on, the entities in the label are left encoded
assert.equal(
  stripHtml(someHtml, {
    skipHtmlDecoding: true,
  }).result,
  `twitter:loretparisi&nbsp;&oslash;`
);

// use the callback to put the link's URL in front of its label
assert.equal(
  stripHtml(someHtml, {
    skipHtmlDecoding: true,
    cb: ({
      tag,
      deleteFrom,
      deleteTo,
      insert,
      rangesArr,
      proposedReturn,
    }) => {
      // look up the "href" attribute, but only on "a" tags carrying attributes
      const hrefAttr =
        tag.name === "a" && tag.attributes
          ? tag.attributes.find((attr) => attr.name === "href")
          : undefined;
      if (hrefAttr) {
        // replace the tag with its URL, followed by the proposed insertion
        rangesArr.push([
          deleteFrom,
          deleteTo,
          `${hrefAttr.value} ${insert}`,
        ]);
      } else {
        // no href found: accept the proposed range as it is
        rangesArr.push(proposedReturn);
      }
    },
  }).result,
  `https://twitter.com/loretparisi twitter:loretparisi&nbsp;&oslash;`
);

Leave only HTML

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a complete HTML document with some text in it
const someHtml = `<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta charset="utf-8">
    <title></title>
  </head>
  <body>
    <h1>Title</h1>
    Some text.
  </body>
</html>`;

// "allTagLocations" holds a [from, to] pair for every tag found
const { allTagLocations } = stripHtml(someHtml);

// slice each tag out of the source and join the slices with nothing in between
const onlyTags = allTagLocations
  .map(([from, to]) => someHtml.slice(from, to))
  .join("");

assert.equal(
  onlyTags,
  `<!DOCTYPE html><html lang="en" dir="ltr"><head><meta charset="utf-8"><title></title></head><body><h1></h1></body></html>`
);

Leave only opening td tags

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a two-by-two table
const someHtml = `<table width="100" border="0" cellpadding="0" cellspacing="0">
  <tr>
    <td class="col1">
      cell1
    </td>
    <td class="col2">
      cell2
    </td>
  </tr>
  <tr>
    <td class="col3">
      cell3
    </td>
    <td class="col4">
      cell4
    </td>
  </tr>
</table>`;

// the first way
// -----------------------------------------------------------------------------

// Only opening td tags get their proposed range recorded;
// for every other tag nothing is pushed at all.
const recordOpeningTd = ({ tag, rangesArr, proposedReturn }) => {
  if (tag.name === "td" && !tag.slashPresent) {
    rangesArr.push(proposedReturn);
  }
};

// notice there's no: onlyStripTags: ["td"]
// we operate purely via callback
const { ranges: openingTdRanges } = stripHtml(someHtml, {
  cb: recordOpeningTd,
});

// slice each recorded range out of the source, trimming the whitespace
// around the tag, and join the slices
const openingTdTags = openingTdRanges
  .map(([from, to]) => someHtml.slice(from, to).trim())
  .join("");

assert.equal(
  openingTdTags,
  `<td class="col1"><td class="col2"><td class="col3"><td class="col4">`
);

// the second way:
// -----------------------------------------------------------------------------

// the accumulator lives in the upper scope and is mutated by the callback
let resultStr = "";

const appendOpeningTd = ({ tag, deleteFrom, deleteTo }) => {
  // anything that is not an opening td tag is ignored
  if (tag.name !== "td" || tag.slashPresent) {
    return;
  }
  const tagSource = someHtml.slice(deleteFrom, deleteTo).trim();
  resultStr = `${resultStr}${tagSource}`;
};

// notice we don't even assign stripHtml() output to anything - we rely only
// on the callback; also notice there's no: onlyStripTags: ["td"]
stripHtml(someHtml, { cb: appendOpeningTd });

assert.equal(
  resultStr,
  `<td class="col1"><td class="col2"><td class="col3"><td class="col4">`
);

Leave only td tags

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a two-by-two table
const someHtml = `<table width="100" border="0" cellpadding="0" cellspacing="0">
  <tr>
    <td class="col1">
      cell1
    </td>
    <td class="col2">
      cell2
    </td>
  </tr>
  <tr>
    <td class="col3">
      cell3
    </td>
    <td class="col4">
      cell4
    </td>
  </tr>
</table>`;

// limit the stripping to td tags and read where they were found
const { filteredTagLocations: tdLocations } = stripHtml(someHtml, {
  onlyStripTags: ["td"],
});

// slice every td tag, opening and closing, out of the source and join them
const onlyTdTags = tdLocations
  .map(([from, to]) => someHtml.slice(from, to))
  .join("");

assert.equal(
  onlyTdTags,
  `<td class="col1"></td><td class="col2"></td><td class="col3"></td><td class="col4"></td>`
);

Minimal example using Ranges

// We strip tags and fix apostrophes
// that's part of what https://codsen.com/os/detergent/ does

import { strict as assert } from "assert";
import { rApply } from "ranges-apply";
import { stripHtml } from "string-strip-html";
import { convertAll } from "string-apostrophes";

/**
 * Strips HTML tags and fixes apostrophes in one go: the Ranges reported by
 * stripHtml() and by convertAll() are merged and applied to the input together.
 *
 * @param {string} str - text to process
 * @returns the result of rApply(); an empty string when the input is
 *   empty or is not a string
 */
function stripAndFixApos(str) {
  if (typeof str !== "string" || !str) {
    return "";
  }
  // Keep in mind, Ranges are an array of 2-3 element arrays.
  // Absent Ranges, however, are marked as null, not as an empty array,
  // so that they can be tested in "if-else" easily - null is falsy
  // while an empty array is truthy.
  // That's why each source gets the "|| []" precaution below.
  const tagRanges = stripHtml(str).ranges || [];
  const apostropheRanges = convertAll(str).ranges || [];
  return rApply(str, tagRanges.concat(apostropheRanges));
}

// strips tags and fixes apostrophes:
// the <strong> pair is gone and the apostrophe in "Let's" is a typographic one
assert.equal(
  stripAndFixApos(`Let's Go <strong>Larval</strong>`),
  `Let’s Go Larval`
);

// no tags, no apostrophes:
// there is nothing to change, so the input is returned as-is
assert.equal(stripAndFixApos(`zzz`), `zzz`);

Remove all HTML from a string

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";

// sample input: a complete HTML document with a heading and some text
const someHtml = `<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta charset="utf-8">
    <title></title>
  </head>
  <body>
    <h1>Title</h1>
    Some text.
  </body>
</html>`;

// with default settings only the text is left:
// the heading text and the body text, separated by a line break
assert.equal(
  stripHtml(someHtml).result,
  `Title\nSome text.`
);

Strip HTML from a raw JSON string

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";
import { traverse } from "ast-monkey-traverse";

/**
 * Parses a JSON string and strips HTML from every string value that sits on
 * an object key; all other values are written back unchanged.
 *
 * @param {string} str - raw JSON source
 * @returns whatever traverse() returns for the parsed input
 */
const stripFromJsonStr = (str) => {
  const parsed = JSON.parse(str);
  return traverse(parsed, (key, val) => {
    // Only a string "val" gets stripped. When an array is traversed, the
    // monkey reports just "key" and "val" is undefined, so array elements
    // never pass this check; neither do booleans, numbers or anything else
    // that is not a string.
    if (typeof val === "string") {
      // monkey's callback is like Array.map - whatever you return gets written:
      return stripHtml(val).result;
    }
    // default return, do nothing: hand back the current value, which is
    // "val" when an object is traversed and "key" when it's an array
    return val === undefined ? key : val;
  });
};

// nothing to strip, "<" is false alarm:
// the result is serialised back with no indentation, for comparison
assert.equal(
  JSON.stringify(
    stripFromJsonStr(
      `{"Operator":"<","IsValid":true}`
    ),
    null,
    0
  ),
  `{"Operator":"<","IsValid":true}`
);

// some HTML within one of key values, monkey will skip the boolean:
// only the string under "Operator" changes
assert.equal(
  JSON.stringify(
    stripFromJsonStr(
      `{"Operator":"a <div>b</div> c","IsValid":true}`
    ),
    null,
    0
  ),
  `{"Operator":"a b c","IsValid":true}`
);

Set the title case using title package

// This program leaves HTML tags alone: single tags (<br class="z"/> for example)
// are skipped, and paired tags are skipped together with the content between them

import { strict as assert } from "assert";
import title from "title";
import { rInvert } from "ranges-invert";
import { rApply } from "ranges-apply";
import { rRegex } from "ranges-regex";
import { stripHtml } from "string-strip-html";

/**
 * Applies title() to a string while leaving alone the HTML tags (paired tags
 * together with the content between them) and the whitelisted words.
 *
 * @param {string} str - text, possibly containing HTML
 * @returns the string with title() applied only to the unprotected chunks
 */
function tagAwareTitle(str) {
  const whitelist = ["eslint", "readme", "npm"];

  // locations of tags; "*" requests every tag pair together with its contents
  const tagRanges = stripHtml(str, {
    stripTogetherWithTheirContents: ["*"],
  }).filteredTagLocations;

  // locations of the whitelisted words, matched case-insensitively;
  // a falsy rRegex() result is simply skipped
  let whitelistedRanges = [];
  for (const word of whitelist) {
    const found = rRegex(new RegExp(word, "gi"), str);
    if (found) {
      whitelistedRanges = whitelistedRanges.concat(found);
    }
  }

  // inverting the protected ranges yields the chunks that should be titled
  const toBeTitled = rInvert(
    tagRanges.concat(whitelistedRanges),
    str.length
  );

  if (!Array.isArray(toBeTitled) || !toBeTitled.length) {
    // nothing came out of the inversion, just apply title() on the whole string:
    return title(str);
  }

  // take inverted ranges, for example, [[3, 4], [10, 15]]
  // and add a third element, the replacement - the very same characters,
  // only processed through "title":
  const replacements = toBeTitled.map(([from, to]) => [
    from,
    to,
    title(str.slice(from, to)),
  ]);
  return rApply(str, replacements);
}

// middle:
// the tag pair sits in the middle of the string; the <code>...</code> span
// comes out exactly as it went in
assert.equal(
  tagAwareTitle(
    `This is a title with some <code>code</code> in it`
  ),
  `This Is a Title with Some <code>code</code> In It`
);

// leading:
// the string starts with tags; the whitelisted "ESLint" keeps its casing
assert.equal(
  tagAwareTitle(
    `<span class="xyz">abc<span> defgh ESLint`
  ),
  `<span class="xyz">abc<span> Defgh ESLint`
);

Widow word removal from text within HTML

import { strict as assert } from "assert";
import { stripHtml } from "string-strip-html";
import { removeWidows } from "string-remove-widows";

// sample input: a sentence followed by an opening tag which has an attribute
const someHtml = `The quick brown fox jumps of the lazy dog.<div class="a">`;

// default widow word removal libs are not aware of HTML:
// -----------------------------------------------------------------------------

// the non-breaking space lands inside the tag, between "div" and "class"
assert.equal(
  removeWidows(someHtml).res,
  `The quick brown fox jumps of the lazy dog.<div&nbsp;class="a">` // 😱
);

// luckily, removeWidows() consumes optional HTML tag locations
assert.equal(
  removeWidows(someHtml, {
    tagRanges: stripHtml(someHtml)
      // keep only [from, to] of each range; the third element,
      // what to insert, is dropped
      .ranges.map(([from, to]) => [from, to]),
  }).res,
  `The quick brown fox jumps of the lazy&nbsp;dog.<div class="a">` // ✅
);