summary | refs | log | tree | commit | diff
path: root/parse_stagit
diff options
context:
space:
mode:
author: KatolaZ <[email protected]> 2020-01-08 19:56:48 +0000
committer: KatolaZ <[email protected]> 2020-01-08 19:56:48 +0000
commit: ddb926830977e7a4a5fe5b91820bdce34c8826e7 (patch)
tree: 3b1d187a40c6250342c69f544367534efbdd9031 /parse_stagit
parent: 3e559a8ecad2c03880c1105a729b69faaa2eb362 (diff)
refactor parse_stagit -- get author and description
Diffstat (limited to 'parse_stagit')
-rwxr-xr-x parse_stagit 80
1 files changed, 58 insertions, 22 deletions
diff --git a/parse_stagit b/parse_stagit
index 39feda2..be52ee3 100755
--- a/parse_stagit
+++ b/parse_stagit
@@ -12,51 +12,87 @@ fi
FIN="/dev/stdin"
URLBASE="$1"
DEST="$2"
-
SUBPATH="/file"
-
CURL="torify curl -Ls "
-PROTO=${URLBASE%%:\/\/*}
-DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}"
-echo "proto: $PROTO"
-echo "dirbase: $DIRBASE"
-READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html"
+## cleanup: remove the temporary file $TMPFILE, then exit (used as trap handler)
+cleanup () {
+ rm -f -- "$TMPFILE"
+ exit
+}
+## get_repos: set $repos from $TMPFILE, one "field|field" word per link row
+get_repos() {
-repos=$($CURL "${URLBASE}" | xml2tsv | grep "/html/body/div/table/tbody/tr/td/a" \
+repos=$(grep "/html/body/div/table/tbody/tr/td/a" "$TMPFILE" \
| awk '{print $(NF-1), $NF}' \
| sed -E 's/href=//g;s/ /\|/'\
)
+}
+## get_descr NAME: print the two rows following NAME's log.html link in $TMPFILE
+get_descr() {
+ reponame=$1
+ echo "reponame: $reponame"
+ ## Escape ERE metacharacters so names such as "c++" or "a.b" match literally
+ reponame_re=$(printf '%s' "$reponame" | sed 's/[][(){}.*+?^$|\\]/\\&/g')
+ grep -Ei -A 2 "/html/body/div/table/tbody/tr/td/a[[:blank:]]+href=${reponame_re}/log.html" "$TMPFILE" |\
+ tail -2 | cut -f 2-
+}
-for r in $repos; do
- name=$(echo "$r" | cut -d "|" -f 2 )
- link=$(echo "$r" | cut -d "|" -f 1 )
- link="${link%%log.html}"
- baselink=$(printf "%s/%s" $URLBASE $link)
- printf "link: %s\nbaselink: %s\n" $link $baselink 1>&2
- REPODIR="$DEST/$DIRBASE/$link/"
- mkdir -p $REPODIR
+## get_readmes LINK DESTDIR: fetch each name in $READMES under LINK into DESTDIR
+get_readmes() {
+ LINK=$1
+ DESTDIR=$2
for f in $READMES; do
- printf " trying file %s..." $baselink/$SUBPATH/$f
- $CURL "$baselink/$SUBPATH/$f" > $REPODIR/$f.tmp
- failure=$(xml2tsv < $REPODIR/$f.tmp 2>/dev/null | \
+ printf " trying file %s..." "$LINK/$f"
+ $CURL "$LINK/$f" > "$DESTDIR/$f.tmp"
+ failure=$(xml2tsv < "$DESTDIR/$f.tmp" 2>/dev/null | \
grep -Eaic "^/html/head/title[[:blank:]]+404 Not Found")
echo $failure
if [ "$failure" = 1 ]; then
printf "[FAILED]\n"
else
- xml2tsv < $REPODIR/$f.tmp 2>/dev/null | \
+ xml2tsv < "$DESTDIR/$f.tmp" 2>/dev/null | \
grep -Eai "/html/body/div/pre/a[[:blank:]]+href=#.*[[:blank:]]+class=line" | \
cut -f 6- | \
- sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > $REPODIR/$f
+ sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > "$DESTDIR/$f"
printf "[OK]\n"
fi
- rm -f $REPODIR/$f.tmp
+ rm -f -- "$DESTDIR/$f.tmp"
sleep 1
done
+}
+
+# main loop
+PROTO=${URLBASE%%:\/\/*}
+DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}"
+echo "proto: $PROTO"
+echo "dirbase: $DIRBASE"
+
+READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html"
+## mktemp picks an unpredictable name; stop if the file cannot be created
+TMPFILE=$(mktemp "${TMPDIR:-/tmp}/parse_stagit.XXXXXX") || exit 1
+## SIGKILL cannot be caught, so it is not listed among the trapped signals
+trap cleanup EXIT TERM INT
+
+## Fetch the index page once; get_repos and get_descr both read $TMPFILE
+$CURL "${URLBASE}" | xml2tsv > "$TMPFILE"
+get_repos
+
+for r in $repos; do
+ name=$(echo "$r" | cut -d "|" -f 2 )
+ link=$(echo "$r" | cut -d "|" -f 1 )
+ link="${link%%log.html}"
+ baselink=$(printf "%s/%s" "$URLBASE" "$link")
+ printf "link: %s\nbaselink: %s\n" "$link" "$baselink" 1>&2
+ REPODIR="$DEST/$DIRBASE/$link/"
+ mkdir -p "$REPODIR"
+ get_descr "$name" > "${REPODIR}/DESCR"
+ ## Get READMEs
+ get_readmes "$baselink/$SUBPATH" "$REPODIR"
done
+cleanup