#!/bin/bash
# Simple bash script for searching raw HTML sources.
# Uses common Unix tools: grep, head, tail.
# Also runs under Cygwin (Windows), but make sure the path to your
# archive (arcDir) is set correctly.
#
# Usage:   ./searchArchive "search term"
# Output:  for every *.html file in arcDir containing the term, a header
#          line with the hit count, followed by up to CONTEXT_BYTES bytes of
#          the file starting at each hit.
# Exit:    0 after a search has run, 1 if the term is missing or one
#          character long, or if no HTML files are found.

# Local archive folder
# --> Edit here for your own local path
arcDir="/pathto/archives/8chan/qresearch/.zfs/snapshot/grab-n-snap-20190713-0227/qresearch/res"
# For Cygwin, the path could be for example:
# arcDir="/cygdrive/D/archive/pol/res"

# Number of bytes printed per hit, counted from the first matched byte.
CONTEXT_BYTES=640

#######################################
# Print a diagnostic message to stderr.
# Arguments: $* - message text
#######################################
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Turn the user's search term into the pattern handed to grep.
# Every "." is escaped so it matches a literal dot; all other characters are
# passed through unchanged and are still interpreted by grep as a basic
# regular expression. Spaces need no escaping because the pattern is always
# passed to grep as one quoted argument.
# Arguments: $1 - raw search term
# Outputs:   the grep pattern on stdout (no trailing newline)
#######################################
build_pattern() {
  local term=$1
  printf '%s' "${term//./\\.}"
}

#######################################
# List all case-insensitive matches of a pattern in one file.
# Globals:   CONTEXT_BYTES (read)
# Arguments: $1 - grep pattern (basic regular expression)
#            $2 - path of the file to search
# Outputs:   nothing if there is no match; otherwise a header line with the
#            hit count and, per hit, " (k) " followed by up to CONTEXT_BYTES
#            bytes of the file starting at the hit, then a newline.
# Returns:   0
#######################################
search_file() {
  local pattern=$1
  local file=$2
  local -a offsets=()
  local line offset
  local hit=0

  # One grep pass per file: with -o and -b each output line is
  # "<0-based byte offset>:<matched text>". Lines that do not start with a
  # numeric offset (e.g. grep's "Binary file ... matches" notice) are skipped.
  # -e keeps a pattern that starts with "-" from being read as an option.
  while IFS= read -r line; do
    if [[ "${line}" =~ ^([0-9]+): ]]; then
      # tail -c +N counts from 1, grep -b counts from 0.
      offsets+=("$(( BASH_REMATCH[1] + 1 ))")
    fi
  done < <(grep -hiob -e "${pattern}" -- "${file}")

  if (( ${#offsets[@]} == 0 )); then
    return 0
  fi

  printf '>%d occurrences in "%s":\n' "${#offsets[@]}" "${file}"
  for offset in "${offsets[@]}"; do
    hit=$(( hit + 1 ))
    printf ' (%d) ' "${hit}"
    tail -c "+${offset}" -- "${file}" | head -c "${CONTEXT_BYTES}"
    printf '\n'
  done
}

#######################################
# Validate the search term, collect the HTML files and search each of them.
# Globals:   arcDir (read)
# Arguments: $1 - search term supplied by the user
# Outputs:   search results on stdout, diagnostics on stderr
# Returns:   0 after a search has run, 1 on a usage or setup error
#######################################
main() {
  local srcTerm=${1:-}
  local pattern file
  local -a files=()

  # Search term provided by user -- make sure it's there and not too short.
  if [[ -z "${srcTerm}" ]]; then
    warn "Script searches for expressions in a HTML-archive"
    warn "No search term given -- script will exit. Next time"
    warn "try e.g.: ./searchArchive \"puzzle ever\""
    return 1
  fi

  if (( ${#srcTerm} == 1 )); then
    warn "# Search term is 1 character long -- that's not smart..."
    warn "# Script refuses to search for \"${srcTerm}\""
    return 1
  elif (( ${#srcTerm} <= 3 )); then
    # NOTE(review): the tail of this message was missing from the source;
    # the wording after "is" is a reconstruction -- confirm or adjust.
    warn "# Warning! Length of search term is <= 3 characters -- expect many hits"
  fi

  # Let the shell expand the glob instead of parsing ls; an unmatched glob
  # stays literal and is filtered out by the -f test.
  for file in "${arcDir}"/*.html; do
    if [[ -f "${file}" ]]; then
      files+=("${file}")
    fi
  done

  if (( ${#files[@]} == 0 )); then
    warn "# Error! No HTML-files found in \"${arcDir}\""
    warn "# Please check if archivePath (\"arcDir=...\") is set correct."
    return 1
  fi
  printf '# Searching for "%s" in "%s" (%d files)\n' \
    "${srcTerm}" "${arcDir}" "${#files[@]}"

  pattern=$(build_pattern "${srcTerm}")
  for file in "${files[@]}"; do
    search_file "${pattern}" "${file}"
  done
}

# Last command of the script, so its return value is the exit status.
main "$@"