Changeset 675ed04


Ignore:
Timestamp:
26/04/2012 17:08:28 (10 years ago)
Author:
Eric van der Vlist <vdv@dyomedea.com>
Branches:
master
Children:
466d447
Parents:
6f64c7f
git-author:
Eric van der Vlist <vdv@dyomedea.com> (26/04/2012 17:08:28)
git-committer:
Eric van der Vlist <vdv@dyomedea.com> (26/04/2012 17:08:28)
Message:

Download and convert the crawl log

Location:
archiver/pipelines/actions
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • archiver/pipelines/actions/get-heritrix-warc.xpl

    r51c2058 r675ed04  
    5656            <p:processor name="oxf:pipeline"> 
    5757                <p:input name="config" href="/data-access.xpl"/> 
    58                 <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)"> 
     58                <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)"> 
    5959                    <config xsl:version="2.0"> 
    6060                        <relpath>queue.xml</relpath> 
     
    7979                            <xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/> 
    8080                        </parameter> 
     81                        <parameter name="log-url" type="string"> 
     82                            <xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/> 
     83                        </parameter> 
    8184                    </config> 
    8285                </p:input> 
     
    8790for $q in /queue return 
    8891    update  
    89         insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/> 
     92        insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/> 
    9093        into $q, 
    9194         
  • archiver/pipelines/actions/package-heritrix-warc.xpl

    rbe1a361 r675ed04  
    3535    </p:processor> 
    3636 
    37 <p:processor name="owk:from-warc-converter"> 
    38 <p:input name="data" href="#warc"/> 
    39 <p:output name="data" id="warc-xml" debug="warc-xml"/> 
    40 </p:processor> 
     37    <p:processor name="owk:from-warc-converter"> 
     38        <p:input name="data" href="#warc"/> 
     39        <p:output name="data" id="warc-xml" debug="warc-xml"/> 
     40    </p:processor> 
    4141 
    4242    <p:processor name="oxf:null-serializer"> 
    4343        <p:input name="data" href="#warc-xml"/> 
    4444    </p:processor> 
    45      
    46     <!-- Store it in a temp file --> 
     45 
     46    <!-- Download the log --> 
     47    <p:processor name="oxf:url-generator"> 
     48        <p:input name="config" transform="oxf:xslt" href="#data"> 
     49            <config xsl:version="2.0"> 
     50                <url> 
     51                    <xsl:value-of select="/action/@log-url"/> 
     52                </url> 
     53                <mode>text</mode> 
     54                <authentication> 
     55                    <username> 
     56                        <xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/> 
     57                    </username> 
     58                    <password> 
     59                        <xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/> 
     60                    </password> 
     61                    <preemptive>false</preemptive> 
     62                </authentication> 
     63            </config> 
     64        </p:input> 
     65        <p:output name="data" id="log" debug="log"/> 
     66    </p:processor> 
     67 
     68    <p:processor name="oxf:xslt"> 
     69        <p:input name="data" href="#log"/> 
     70        <p:input name="config" href="parse-log.xslt"></p:input> 
     71        <p:output name="data" id="log-xml" debug="log-xml"/> 
     72    </p:processor> 
     73 
     74    <p:processor name="oxf:null-serializer"> 
     75        <p:input name="data" href="#log-xml"/> 
     76    </p:processor> 
     77 
     78 
     79    <!-- Store the WARC in a temp file --> 
    4780    <p:processor name="oxf:file-serializer"> 
    4881        <p:input name="config"> 
     
    84117        <p:output name="data" id="zip"/> 
    85118    </p:processor> 
    86      
     119 
    87120    <p:processor name="oxf:file-serializer"> 
    88121        <p:input name="config"> 
     
    92125        </p:input> 
    93126        <p:input name="data" href="#zip"/> 
    94          
    95     </p:processor> 
    96      
    97    <!-- <p:choose href="#heritrix-job"> 
     127 
     128    </p:processor> 
     129 
     130    <!-- <p:choose href="#heritrix-job"> 
    98131        <p:when test="/job/crawlControllerState='FINISHED'"> 
    99132            <!-\- The job is finished, we can get its archive... -\-> 
Note: See TracChangeset for help on using the changeset viewer.