Hacker News new | past | comments | ask | show | jobs | submit | textmode's comments login

correction:

sed -n '/pattern/=' file|yy092|sed -nf/dev/stdin file


Correction:

      /* Strip HTTP response headers from a raw HTTP stream on stdin,
         passing everything else (e.g. gzip/zip response bodies) through
         to stdout unchanged.  How it works:
           - an "HTTP/... CRLF" status line is swallowed and increments x,
           - while x is nonzero, "Name: value CRLF" header lines are
             suppressed by the if(!x) guards in the rules below,
           - the bare CRLF ending the header block resets x to 0, so
             following body bytes are copied by flex's default ECHO. */

     int fileno (FILE *);
     int setenv (const char *, const char *, int);
     /* jmp N would switch the scanner start condition via flex
        internals; defined but unused in this particular scanner. */
     #define jmp (yy_start) = 1 + 2 *
     /* x is nonzero while we are inside an HTTP header block. */
     int x;
    %option nounput noinput noyywrap
    %%
    HTTP\/[\40-\176]+\x0d\x0a x++;
    [\40-\176]+:[\40-\176]+\r\n if(!x)fwrite(yytext,1,yyleng,yyout);
    \x0D\x0A if(!x)fwrite(yytext,1,yyleng,yyout);x=0;
    %%
    /* Scan stdin to EOF, then exit successfully. */
    int main()
    { 
    yylex();
    exit(0);
    }

Usage example:

Retrieve hostnames, IP addresses and (if available) sitemap URLs from latest Common Crawl.

     # Fetch the robots.txt WARC path list for this crawl (~180K gzip).
     ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
     # Turn the first 5 paths into pipelined HTTP/1.1 requests ("[]" is
     # a placeholder that tr expands to CR/LF), send them all over one
     # TLS connection, extract the gzip data with yy054, then grep the
     # interesting WARC fields out of the decompressed stream.
     gzip -dc robotstxt.paths.gz \
     |head -5 \
     |sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
           $!s/$/keep-alive[]/;$s/$/close[]/' \
     |tr [] '\r\n' \
     |openssl s_client -quiet -connect data.commoncrawl.org:443 \
     |yy054 \
     |zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
     exec cat 1.txt


Usage example:

Download NetBSD 1.0 in a single TCP connection.

    # Build pipelined HTTP requests for the src10.* split archives listed
    # in the here-doc ("[]" becomes CR LF via tr; last request uses
    # HTTP/1.0 so the server closes the connection), send them over one
    # TLS connection, and save the raw responses to "http+gzip".
    y="GET /pub/NetBSD-archive/NetBSD-1.0/source/src10/"
    z="Host: archive.netbsd.org"
    sed '$!s>.*>'"$y"'& HTTP/1.1[]'"$z"'[]Connection: keep-alive[]>;
         $s>.*>'"$y"'& HTTP/1.0[]'"$z"'[]>' << eof \
    |tr '[]' '\r\n' \
    |openssl s_client -quiet -connect 151.101.129.6:443 -servername archive.netbsd.org > http+gzip
    src10.aa
    src10.ab
    src10.ac
    src10.ad
    src10.ae
    src10.af
    src10.ag
    src10.ah
    src10.ai
    src10.aj
    src10.ak
    src10.al
    src10.am
    src10.an
    src10.ao
    src10.ap
    src10.aq
    src10.ar
    src10.as
    src10.at
    src10.au
    src10.av
    src10.aw
    src10.ax
    src10.ay
    src10.az
    src10.ba
    src10.bb
    src10.bc
    src10.bd
    src10.be
    src10.bf
    eof

    # Strip the HTTP headers (yy054), then list the reassembled tarball.
    yy054 < http+gzip|tar tvzf /dev/stdin
Alternate usage:

Providing any argument (argv[1]) makes it print the HTTP headers only:

    yy054 print < http+gzip
    yy054 x < http+gzip


Normally I use yy030 but I have been experimenting with this instead.

Seems to be slightly faster and smaller than similar programs from html-xml-utils.

https://www.w3.org/Tools/HTML-XML-utils/man1/

Compile:

   # Fetch the HN item containing the flex source, cut the source lines
   # out of the page dump, strip the 6-space indent the dump adds, then
   # run flex and build a static binary.
   links -no-connect -dump https://news.ycombinator.com/item?id=38727772 \
   |sed '1,4d;77,$d;s/[ ]\{6\}//' \
   |flex -8Cem;cc -O3 -std=c89 -W -Wall -pipe lex.yy.c -static -o yy044
   # Remove symbols to shrink the binary.
   strip -s yy044
Example usage:

      # NB. not a real cookie
      # Fetch the HN front page with a session cookie, run yy044 with
      # the host prefix passed via $host, then unescape "&amp;" and
      # print only the lines containing vote links.
      curl -H "cookie=user=santa&K7RGzmUtAoKv9OIRMfQ9bfwYpiDEuypp" -siA "" \
      https://news.ycombinator.com \
      |host=news.ycombinator.com/ yy044 r \
      |sed -n 's/&amp;/\&/g;/vote/p'


   /* Decode HTTP chunked transfer encoding from a raw HTTP/1.x
      response on stdin: the status line and headers are echoed, and if
      a "transfer-encoding: chunked" header is seen, the hex chunk-size
      lines are stripped from the body while the chunk payload bytes
      are echoed.  Start conditions (%s below) reuse the definition
      names; approximate roles:
        INITIAL -> xc on an HTTP/0.9|1.0|1.1 status line
        xc: headers; -> xb after "transfer-encoding:"
        xb: set ischunked on "chunked"; -> xe at the blank line
            ending the headers
        xe: consume a hex chunk-size line, record chunksize, -> xd
        xd: copy payload bytes, counting up to chunksize, then -> xe
        xf: swallow remaining chunk-size lines
      NOTE(review): the xd/xe/xf handoff is intricate; verified by
      reading, not by tracing every possible input. */
   
    /* echo: write the current match to yyout (fwrite result ignored). */
    #define echo do{if(fwrite(yytext,(size_t)yyleng,1,yyout)){}}while(0)
    /* jmp N: switch the scanner start condition via flex internals. */
    #define jmp (yy_start) = 1 + 2 *
    int fileno (FILE *);
    /* ischunked: nonzero once the "chunked" token was seen;
       chunksize: byte count parsed from the current hex size line;
       count: payload bytes copied so far in the current chunk. */
    int ischunked,chunksize,count;
   /* xa: CR or LF; xb: CRLF; xc: HTTP version token;
      xd: "chunked" (case-insensitive); xe: nonempty hex line + CRLF;
      xf: possibly-empty hex line + CRLF. */
   xa "\15"|"\12"
   xb "\15\12" 
   xc "HTTP/0.9"|"HTTP/1.0"|"HTTP/1.1"
   xd [Cc][Hh][Uu][Nn][Kk][Ee][Dd]
   xe [0-9a-fA-F]+\r\n
   xf [0-9a-fA-F]*\r\n
   %option noyywrap nounput noinput 
   %s xb xc xd xe xf
   %%
   ^{xc} echo;ischunked=0;jmp xc;
   <xc>^transfer-encoding: echo;jmp xb;
   <xb>\r\n\r\n echo;jmp xe;
   <xb>{xd} echo;ischunked=1;
   <xe>{xf}|{xe} {
   count=0;
   if(ischunked==1)
   {chunksize=strtol(yytext,NULL,16);
   jmp xd;};
   };
   <xd>{xb} jmp xf;
   <xd>. { 
   count++;
   if(count==chunksize)jmp xe;
   echo;
   };
   <xf>^[A-Fa-f0-9]+{xa}
   <xf>{xa}+[A-Fa-f0-9]+{xa}
   <xf>{xb}[A-Fa-f0-9]+{xb}
   %%
   /* Scan stdin to EOF, then exit successfully. */
   int main(){ yylex();exit(0);}


Below is a short script that downloads and makes a PDF from the image files. No browser required.

The script uses a feature of HTTP/1.1 called pipelining; proponents of HTTP/2 and HTTP/3 want people to believe it has problems because it does not fit their commercialised web business model. As demonstrated by the script below, it has no problems. It's a feature that simply does not suit the online ad industry-funded business model with its gigantic corporate browser, bloated conglomeration web pages and incessant data collection. Here, only 2 TCP connections are used to retrieve 141 images. Most servers are less restrictive and allow more than 100 requests per TCP connection. Pipelining works great. Much more efficient than browsers which open hundreds of connections. IMHO.

    # Request images 1-200 in two pipelined HTTP connections (yy025
    # builds the request stream), hex-dump the concatenated responses,
    # then split the stream at the JPEG end/start-of-image marker pair
    # (ffd9 followed by ffd8) so each image lands on its own line.
    (export Connection=keep-alive
    x1=http://www.minimizedistraction.com/img/vrg_google_doc_final_vrs03-
    x2(){ seq -f "$x1%g.jpg" $1 $2;};
    x3(){ yy025|nc -vvn 173.236.175.199 80;};
    x2   1 100|x3;
    x2 101 200|x3;
    )|exec yy056|exec od -An -tx1 -vw99999|exec tr -d '\40'|exec sed 's/ffd9ffd8/ffd9\
    ffd8/g'|exec sed -n /ffd8/p|exec split -l1;
    # Convert each hex line back to a binary JPEG, then bundle into a PDF.
    for x in x??;do xxd -p -r < $x > $x.jpg;rm $x;done;
    convert x??.jpg 1.pdf 2>/dev/null;rm x??.jpg

    ls -l ./1.pdf
More details on yy025 and yy056 here: https://news.ycombinator.com/item?id=27769701


I make most HTTP requests using netcat or similar tcp clients so I write filters that read from stdin. Reading text files with the chunk sizes in hex interspersed is generally easy. Sometimes I do not even bother to remove the chunk sizes. Where it becomes an issue is when it breaks URLs. Here is a simple chunked transfer decoder that reads from stdin and removes the chunk sizes.

   # Build yy045, a filter that deletes the hex chunk-size lines of an
   # HTTP chunked-transfer body read from stdin (payload passes through
   # via flex's default ECHO).  The here-doc is flex input: its lines
   # must stay left-justified except the " int fileno" declaration,
   # which must keep its leading space.
   flex -8iCrfa <<eof
    int fileno (FILE *);
   xa "\15"|"\12"
   xb "\15\12" 
   %option noyywrap nounput noinput 
   %%
   ^[A-Fa-f0-9]+{xa}
   {xa}+[A-Fa-f0-9]+{xa}
   {xb}[A-Fa-f0-9]+{xb} 
   %%
   int main(){ yylex();exit(0);}
   eof

   # Compile the generated scanner into a static binary.
   cc -std=c89 -Wall -pipe lex.yy.c -static -o yy045
Example

Yahoo! serves chunked pages

   printf 'GET / HTTP/1.1\r\nHost: us.yahoo.com\r\nConnection: close\r\n\r\n'|openssl s_client -connect us.yahoo.com:443 -ign_eof|./yy045


I tried this but ended up with gibberish in my terminal. Also couldn't find an explanation for -a on flex's man page. I've never used the thing before.


The extra "a" is a typo but would have no effect. The "i" is also superfluous but harmless. Without more details on the "gibberish" it is difficult to guess what happened. The space before "int fileno (FILE *);" is required. All the other lines must be left-justified, no leading spaces, except the line with "int main()" which can be indented if desired.


This is the script I am running - https://pastebin.com/65GxJ9i9. The - after << ignores the tabbed indents in heredoc.

This is what it produces for me when I run `lexit.sh us.yahoo.com` - https://stuff-storage.sfo3.digitaloceanspaces.com/ee.txt


https://news.ycombinator.com/item?id=27490265 <-- yy054

The "gibberish" is GZIP compressed data. "yy054" is a simple filter I wrote to extract a GZIP file from stdin, i.e., discard leading and trailing garbage. As far as I can tell, the compressed file "ee.txt" is not chunked transfer encoded. If it were chunked we would first extract the GZIP, then decompress and finally process the chunks (e.g., filter out the chunk sizes with the filter submitted in the OP).

In this case all we need to do is extract the GZIP file "ee.txt" from stdin, then decompress it:

    printf "GET /ee.txt\r\nHost: stuff-storage.sfo3.digitaloceanspaces.com\r\nConnection: close\r\n\r\n"|openssl s_client -connect 138.68.34.161:443 -quiet|yy054|gzip -dc > 1.htm
    firefox ./1.htm
   
Hope this helps. Apologies I initially guessed wrong on here doc. I was not sure what was meant by "gibberish". Looks like the here doc is working fine.


New pastebin; there was a typo in the old one — https://pastebin.com/4j9Z3eCc


Need to get rid of the leading spaces on all lines except the "int fileno" line. Can also forgo the "here doc" and just save the lines between "flex" and "eof" to a file. Run flex on that file. This will create lex.yy.c. Then compile lex.yy.c.

The compiled program is only useful for filtering chunked transfer encoding on stdin. Most "HTTP clients" like wget or curl already take care of processing chunked transfer encoding. It is when working with something like netcat that chunked transfer encoding becomes "DIY". This is a simple program that attempts to solve that problem. It could be written by hand without using flex.


Okay I'll give up for now. There are really no spaces in front of the lines. In pastebin if you check the raw version you'll see they are tabs. Which get stripped out because I added a `-` before eof. Providing the file manually to flex also produces the same gibberish for me.


Corrections:

/int main/{s/input();//;s/return 0/exit(0)/;};/int yywrap/s/return 0/exit(0)/;/%option/s/$/ noinput/


    /* Drain the output buffer to stdout; exit with errno as the exit
       status if the write fails.  buf/buflen are file-scope state
       declared elsewhere in this file. */
    static void flush(void)
    {
        if (writeall(1, buf, buflen) == -1)
            _exit(errno);
        buflen = 0;
    }

    /* Append one byte to the output buffer, draining it first when full. */
    static void wrch(const char ch)
    {
        if (buflen >= sizeof buf)
            flush();
        buf[buflen++] = ch;
    }
    char inbuf[128];
    /* Hex-dump stdin to stdout: each input byte becomes two lowercase
       hex digits, followed by one trailing newline at EOF.  Exits with
       errno as status on a failed read (or, via flush/wrch, a failed
       write). */
    int main(int argc, char **argv) {
        long long r, i;
        (void) argc;
        (void) argv;
        for (;;) {
            r = read(0, inbuf, sizeof inbuf);
            if (r == -1) _exit(errno);
            if (r == 0) break;
            for (i = 0; i < r; ++i) {
                /* Cast to unsigned char so bytes >= 0x80 index the
                   table correctly: char may be signed, and right-
                   shifting a negative value is implementation-defined. */
                unsigned char c = (unsigned char) inbuf[i];
                wrch("0123456789abcdef"[c >> 4]);
                wrch("0123456789abcdef"[c & 15]);
            }
        }
        wrch('\n');
        flush();  /* BUG FIX: drain the buffer before exiting; without
                     this, up to sizeof buf bytes of buffered output
                     were silently lost. */
        return 0;
    }


    #include <unistd.h>
    #include <errno.h>
    #include <sys/types.h>
    /* Write all xlen bytes of xv to fd, looping over short writes in
       chunks of at most 1 MiB.  Returns 0 on success, -1 on write
       error (errno left set by write); EINTR is retried. */
    int writeall(int fd,const void *xv,long long xlen)
    {
      const unsigned char *x = xv;
      long long w;
      while (xlen > 0) {
        w = xlen;
        if (w > 1048576) w = 1048576;
        w = write(fd,x,w);
        if (w == -1) {
          if (errno == EINTR) continue;  /* interrupted: just retry */
          /* BUG FIX: the unchecked -1 used to be added to x and
             subtracted from xlen, walking the pointer backwards and
             looping forever on a failed descriptor. */
          return -1;
        }
        x += w;
        xlen -= w;
      }
      return 0;
    }
    /* Map one ASCII hex digit to its value 0..15; return -1 for any
       character that is not a hex digit. */
    static int hexdigit(char x)
    {
      switch (x) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          return x - '0';
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
          return (x - 'a') + 10;
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          return (x - 'A') + 10;
        default:
          return -1;
      }
    }
    /* Decode exactly len bytes from the NUL-terminated hex string x
       into y.  Returns 1 on success; 0 if x is NULL, contains a
       non-hex character, or is not exactly 2*len characters long. */
    int hexparse(unsigned char *y,long long len,const char *x)
    {
      long long i;
      if (!x) return 0;
      for (i = 0; i < len; ++i) {
        int hi = hexdigit(x[2 * i]);
        if (hi == -1) return 0;  /* also stops safely at the NUL */
        int lo = hexdigit(x[2 * i + 1]);
        if (lo == -1) return 0;
        y[i] = (unsigned char)(16 * hi + lo);
      }
      /* reject trailing characters beyond the 2*len we consumed */
      return x[2 * len] ? 0 : 1;
    }


Join us for AI Startup School this June 16-17 in San Francisco!

Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact

Search: